triton-lang · ShawnZhong · Jun 26, 2025 · Jun 27, 2025 · Jun 27, 2025
@@ -6,7 +6,7 @@ module attributes {"ttg.num-warps" = 8 : i32} {
   // CHECK-LABEL: convert_read_counter
   llvm.func @convert_read_counter() -> i32 {
     // CHECK: rocdl.sched.barrier 0
-    %1 = proton_gpu.read_counter : i32
+    %1 = proton_gpu.read_counter {metric = 0 : i32} : i32
     llvm.return %1 : i32
   }
 }
@@ -54,21 +54,21 @@ module attributes {"ttg.num-warps" = 8 : i32, ttg.profile_scratch_memory_alignme
     %0 = ttg.local_alloc : () -> !ttg.memdesc<512xi32, #shared, #smem, mutable>
     %1 = proton_gpu.global_scratch_alloc {alignment = 128 : i32, nbytes = 384 : i32, offset = 0 : i32} : !tt.ptr<i32>
     %2 = proton_gpu.segment_alloc %0 : !ttg.memdesc<512xi32, #shared, #smem, mutable> -> !proton_gpu.segment<2048, #smem, warp>
-    %3 = proton_gpu.read_counter : i32
+    %3 = proton_gpu.read_counter {metric = 0 : i32 } : i32
     proton_gpu.circular_store start %2, %3 {scopeId = 0 : i32} : !proton_gpu.segment<2048, #smem, warp>, i32
     scf.for %arg0 = %c0 to %c4 step %c1 {
-      %7 = proton_gpu.read_counter : i32
+      %7 = proton_gpu.read_counter {metric = 0 : i32} : i32
       proton_gpu.circular_store start %2, %7 {scopeId = 0 : i32} : !proton_gpu.segment<2048, #smem, warp>, i32
       scf.for %arg1 = %c0 to %c4 step %c1 {
-        %9 = proton_gpu.read_counter : i32
+        %9 = proton_gpu.read_counter {metric = 0 : i32} : i32
         proton_gpu.circular_store start %2, %9 {scopeId = 0 : i32} : !proton_gpu.segment<2048, #smem, warp>, i32
       }
-      %8 = proton_gpu.read_counter : i32
+      %8 = proton_gpu.read_counter {metric = 0 : i32} : i32
       proton_gpu.circular_store start %2, %8 {scopeId = 0 : i32} : !proton_gpu.segment<2048, #smem, warp>, i32
     }
-    %5 = proton_gpu.read_counter : i32
+    %5 = proton_gpu.read_counter {metric = 0 : i32} : i32
     proton_gpu.circular_store start %2, %5 {scopeId = 0 : i32} : !proton_gpu.segment<2048, #smem, warp>, i32
-    %6 = proton_gpu.read_counter : i32
+    %6 = proton_gpu.read_counter {metric = 0 : i32} : i32
     proton_gpu.circular_store start %2, %6 {scopeId = 0 : i32} : !proton_gpu.segment<2048, #smem, warp>, i32
     gpu.barrier
     proton_gpu.finalize %2, %1 : !proton_gpu.segment<2048, #smem, warp>, !tt.ptr<i32>

@@ -18,15 +18,27 @@ module attributes {"ttg.num-warps" = 8 : i32} {
 #shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}>
 #smem = #ttg.shared_memory
 module attributes {"ttg.num-warps" = 8 : i32} {
-  // CHECK-LABEL: convert_read_counter
-  llvm.func @convert_read_counter() -> i32 {
+  // CHECK-LABEL: convert_read_counter_cycle
+  llvm.func @convert_read_counter_cycle() -> i32 {
     //CHECK: llvm.call_intrinsic "llvm.amdgcn.s.memtime"() : () -> i64
     //CHECK: llvm.trunc %{{.*}} : i64 to i32
-    %1 = proton_gpu.read_counter : i32
+    %1 = proton_gpu.read_counter {metric = 0 : i32} : i32
     llvm.return %1 : i32
   }
 }
 
+// -----
+#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.num-warps" = 8 : i32} {
+  // CHECK-LABEL: convert_read_counter_realtime
+  llvm.func @convert_read_counter_realtime() -> i64 {
+    //CHECK: llvm.call_intrinsic "llvm.amdgcn.s.memrealtime"() : () -> i64
+    %1 = proton_gpu.read_counter {metric = 1 : i32} : i64
+    llvm.return %1 : i64
+  }
+}
+
 // -----
 
 #shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}>
@@ -64,7 +76,7 @@ module attributes {"ttg.num-warps" = 8 : i32} {
   	// CHECK-DAG: %[[CYCLE1:.*]] = llvm.call_intrinsic "llvm.amdgcn.s.memtime"()
     %0 = ttg.local_alloc : () -> !ttg.memdesc<512xi32, #shared, #smem, mutable>
     %3 = proton_gpu.segment_alloc %0 : !ttg.memdesc<512xi32, #shared, #smem, mutable> -> !proton_gpu.segment<2048, #smem, warp, [0, 1]>
-    %8 = proton_gpu.read_counter : i32
+    %8 = proton_gpu.read_counter {metric = 0 : i32} : i32
     proton_gpu.circular_store start %3, %8 {scopeId = 1 : i32} : !proton_gpu.segment<2048, #smem, warp, [0, 1]>, i32
     llvm.return
   }
@@ -147,7 +159,7 @@ module attributes {"ttg.num-warps" = 8 : i32} {
     // CHECK-DAG: llvm.extractelement %[[CYCLE64]]
     %0 = ttg.local_alloc : () -> !ttg.memdesc<512xi32, #shared, #smem, mutable>
     %3 = proton_gpu.segment_alloc %0 : !ttg.memdesc<512xi32, #shared, #smem, mutable> -> !proton_gpu.segment<2048, #smem, warp, [0, 1]>
-    %8 = proton_gpu.read_counter : i64
+    %8 = proton_gpu.read_counter {metric = 0 : i32} : i64
     proton_gpu.circular_store start %3, %8 {scopeId = 1 : i32} : !proton_gpu.segment<2048, #smem, warp, [0, 1]>, i64
     llvm.return
   }

@@ -12,20 +12,32 @@ module attributes {"ttg.num-warps" = 8 : i32} {
   }
 }
 
-
 // -----
 
 #shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}>
 #smem = #ttg.shared_memory
 module attributes {"ttg.num-warps" = 8 : i32} {
-  // CHECK-LABEL: convert_read_counter
-  llvm.func @convert_read_counter() {
+  // CHECK-LABEL: convert_read_counter_cycle
+  llvm.func @convert_read_counter_cycle() {
     // CHECK: llvm.inline_asm has_side_effects asm_dialect = att operand_attrs = [] "mov.u32 $0, %clock;", "=r"  : () -> i32
-    %1 = proton_gpu.read_counter : i32
+    %1 = proton_gpu.read_counter {metric = 0 : i32} : i32
     llvm.return
   }
 }
 
+// -----
+
+#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.num-warps" = 8 : i32} {
+  // CHECK-LABEL: convert_read_counter_realtime
+  llvm.func @convert_read_counter_realtime() -> i64 {
+    // CHECK: llvm.call_intrinsic "llvm.nvvm.read.ptx.sreg.globaltimer"() : () -> i64
+    %1 = proton_gpu.read_counter {metric = 1 : i32} : i64
+    llvm.return %1 : i64
+  }
+}
+
 
 // -----
 
@@ -89,7 +101,7 @@ module attributes {"ttg.num-warps" = 8 : i32} {
     %3 = proton_gpu.segment_alloc %0 : !ttg.memdesc<512xi32, #shared, #smem, mutable> -> !proton_gpu.segment<2048, #smem, warp, [0, 1]>
     scf.for %arg0 = %c0 to %c4 step %c1 {
       scf.for %arg1 = %c0 to %c4 step %c1 {
-        %8 = proton_gpu.read_counter : i32
+        %8 = proton_gpu.read_counter {metric = 0 : i32} : i32
         proton_gpu.circular_store start %3, %8 {scopeId = 1 : i32} : !proton_gpu.segment<2048, #smem, warp, [0, 1]>, i32
       }
     }
@@ -117,7 +129,7 @@ module attributes {"ttg.num-warps" = 8 : i32} {
     // CHECK-DAG: llvm.inline_asm has_side_effects{{.*}}st.shared.v2.b32{{.*}}%[[SMEM_PTR]], %{{.*}}, %{{.*}}, %{{.*}}
     %0 = ttg.local_alloc : () -> !ttg.memdesc<512xi32, #shared, #smem, mutable>
     %3 = proton_gpu.segment_alloc %0 : !ttg.memdesc<512xi32, #shared, #smem, mutable> -> !proton_gpu.segment<2048, #smem, warp, [0, 1]>
-    %8 = proton_gpu.read_counter : i32
+    %8 = proton_gpu.read_counter {metric = 0 : i32} : i32
     proton_gpu.circular_store start %3, %8 {scopeId = 1 : i32} : !proton_gpu.segment<2048, #smem, warp, [0, 1]>, i32
     llvm.return
   }
@@ -211,7 +223,7 @@ module attributes {"ttg.num-warps" = 8 : i32} {
   llvm.func @use_clock64() {
     %0 = ttg.local_alloc : () -> !ttg.memdesc<512xi32, #shared, #smem, mutable>
     %3 = proton_gpu.segment_alloc %0 : !ttg.memdesc<512xi32, #shared, #smem, mutable> -> !proton_gpu.segment<2048, #smem, warp, [0, 1]>
-    %8 = proton_gpu.read_counter : i64
+    %8 = proton_gpu.read_counter {metric = 0 : i32} : i64
     proton_gpu.circular_store start %3, %8 {scopeId = 1 : i32} : !proton_gpu.segment<2048, #smem, warp, [0, 1]>, i64
     llvm.return
   }

@@ -34,7 +34,7 @@ module attributes {"ttg.num-warps" = 8 : i32} {
     %1 = proton_gpu.global_scratch_alloc {alignment = 128 : i32, nbytes = 384 : i32} : !tt.ptr<i32>
     %seg = proton_gpu.segment_alloc %0 : !ttg.memdesc<64xi32, #shared, #smem, mutable> -> !proton_gpu.segment<256, #shared, warp>
     %seg_stack = proton_gpu.segment_alloc %stack : !ttg.memdesc<64xi32, #shared, #proton_gpu.stack_memory, mutable> -> !proton_gpu.segment<256, #proton_gpu.stack_memory, warp>
-    %3 = proton_gpu.read_counter : i32
+    %3 = proton_gpu.read_counter {metric = 0 : i32 } : i32
     proton_gpu.circular_store start %seg, %3 {scopeId = 0 : i32} : !proton_gpu.segment<256, #shared, warp>, i32
     gpu.barrier
     proton_gpu.finalize %seg, %1 : !proton_gpu.segment<256, #shared, warp>, !tt.ptr<i32>

@@ -15,9 +15,9 @@ module attributes {"ttg.num-warps" = 8 : i32} {
   // CHECK: %[[SCRATCH:.*]] = proton_gpu.global_scratch_alloc {alignment = 128 : i32, nbytes = 1152 : i32} : !tt.ptr<i32>
   // CHECK: %[[BUF:.*]] = ttg.local_alloc  : () -> !ttg.memdesc<256xi32, #shared, #smem, mutable>
   // CHECK: %[[SEGMENT:.*]] = proton_gpu.segment_alloc %[[BUF]]
-  // CHECK: %[[START:.*]] = proton_gpu.read_counter : i32
+  // CHECK: %[[START:.*]] = proton_gpu.read_counter {metric = 0 : i32} : i32
   // CHECK: proton_gpu.circular_store start %[[SEGMENT]], %[[START]] {scopeId = 0 : i32} : !proton_gpu.segment<1024, #smem, warp>, i32
-  // CHECK: %[[END:.*]] = proton_gpu.read_counter : i32
+  // CHECK: %[[END:.*]] = proton_gpu.read_counter {metric = 0 : i32} : i32
   // CHECK: proton_gpu.circular_store end %[[SEGMENT]], %[[END]] {scopeId = 0 : i32} : !proton_gpu.segment<1024, #smem, warp>, i32
   // CHECK: gpu.barrier
   // CHECK: proton_gpu.finalize %[[SEGMENT]], %[[SCRATCH]] : !proton_gpu.segment<1024, #smem, warp>, !tt.ptr<i32>
@@ -40,15 +40,15 @@ module attributes {"ttg.num-warps" = 8 : i32} {
     // CHECK: %[[SCRATCH:.*]] = proton_gpu.global_scratch_alloc
     // CHECK: %[[BUF:.*]] = ttg.local_alloc
     // CHECK: %[[SEGMENT:.*]] = proton_gpu.segment_alloc %[[BUF]]
-    // CHECK: %[[START0:.*]] = proton_gpu.read_counter : i32
+    // CHECK: %[[START0:.*]] = proton_gpu.read_counter {metric = 0 : i32} : i32
     // CHECK: proton_gpu.circular_store start %[[SEGMENT]], %[[START0]] {scopeId = 0 : i32}
     // CHECK: scf.for
-    // CHECK: %[[START1:.*]] = proton_gpu.read_counter : i32
+    // CHECK: %[[START1:.*]] = proton_gpu.read_counter {metric = 0 : i32} : i32
     // CHECK: proton_gpu.circular_store start %[[SEGMENT]], %[[START1]] {scopeId = 1 : i32}
-    // CHECK: %[[END1:.*]] = proton_gpu.read_counter : i32
+    // CHECK: %[[END1:.*]] = proton_gpu.read_counter {metric = 0 : i32} : i32
     // CHECK: proton_gpu.circular_store end %[[SEGMENT]], %[[END1]] {scopeId = 1 : i32}
     // CHECK: }
-    // CHECK: %[[END0:.*]] = proton_gpu.read_counter : i32
+    // CHECK: %[[END0:.*]] = proton_gpu.read_counter {metric = 0 : i32} : i32
     // CHECK: proton_gpu.circular_store end %[[SEGMENT]], %[[END0]] {scopeId = 0 : i32}
     // CHECK: gpu.barrier
     // CHECK: proton_gpu.finalize %[[SEGMENT]], %[[SCRATCH]]
@@ -73,21 +73,21 @@ module attributes {"ttg.num-warps" = 8 : i32} {
     // CHECK: %[[SCRATCH:.*]] = proton_gpu.global_scratch_alloc
     // CHECK: %[[BUF:.*]] = ttg.local_alloc
     // CHECK: %[[SEGMENT:.*]] = proton_gpu.segment_alloc %[[BUF]]
-    // CHECK: %[[START0:.*]] = proton_gpu.read_counter : i32
+    // CHECK: %[[START0:.*]] = proton_gpu.read_counter {metric = 0 : i32} : i32
     // CHECK: proton_gpu.circular_store start %[[SEGMENT]], %[[START0]] {scopeId = 0 : i32}
     // CHECK: scf.for
-    // CHECK: %[[START1:.*]] = proton_gpu.read_counter : i32
+    // CHECK: %[[START1:.*]] = proton_gpu.read_counter {metric = 0 : i32} : i32
     // CHECK: proton_gpu.circular_store start %[[SEGMENT]], %[[START1]] {scopeId = 1 : i32}
     // CHECK: scf.for
-    // CHECK: %[[END1:.*]] = proton_gpu.read_counter : i32
+    // CHECK: %[[END1:.*]] = proton_gpu.read_counter {metric = 0 : i32} : i32
     // CHECK: proton_gpu.circular_store end %[[SEGMENT]], %[[END1]] {scopeId = 1 : i32}
     // CHECK: }
-    // CHECK: %[[END0:.*]] = proton_gpu.read_counter : i32
+    // CHECK: %[[END0:.*]] = proton_gpu.read_counter {metric = 0 : i32} : i32
     // CHECK: proton_gpu.circular_store end %[[SEGMENT]], %[[END0]] {scopeId = 0 : i32}
     // CHECK: }
-    // CHECK: %[[START2:.*]] = proton_gpu.read_counter : i32
+    // CHECK: %[[START2:.*]] = proton_gpu.read_counter {metric = 0 : i32} : i32
     // CHECK: proton_gpu.circular_store start %[[SEGMENT]], %[[START2]] {scopeId = 2 : i32}
-    // CHECK: %[[END2:.*]] = proton_gpu.read_counter : i32
+    // CHECK: %[[END2:.*]] = proton_gpu.read_counter {metric = 0 : i32} : i32
     // CHECK: proton_gpu.circular_store end %[[SEGMENT]], %[[END2]] {scopeId = 2 : i32}
     // CHECK: gpu.barrier
     // CHECK: proton_gpu.finalize %[[SEGMENT]], %[[SCRATCH]]

@@ -5,8 +5,8 @@ module attributes {"ttg.num-warps" = 8 : i32} {
   // CHECK: %[[SCRATCH:.*]] = proton_gpu.global_scratch_alloc {alignment = 128 : i32, nbytes = 1152 : i32} : !tt.ptr<i32>
   // CHECK-NEXT: %[[BUF:.*]] = ttg.local_alloc  : () -> !ttg.memdesc<256xi32, #shared, #smem, mutable>
   // CHECK-NEXT: %[[SEGMENT:.*]] = proton_gpu.segment_alloc %[[BUF]]
-  // CHECK-NEXT: %[[START:.*]] = proton_gpu.read_counter : i32
-  // CHECK-NEXT: %[[END:.*]] = proton_gpu.read_counter : i32
+  // CHECK-NEXT: %[[START:.*]] = proton_gpu.read_counter {metric = 0 : i32} : i32
+  // CHECK-NEXT: %[[END:.*]] = proton_gpu.read_counter {metric = 0 : i32} : i32
   // CHECK-NEXT: proton_gpu.circular_store start %[[SEGMENT]], %[[START]] {scopeId = 0 : i32} : !proton_gpu.segment<1024, #smem, warp>, i32
   // CHECK-NEXT: proton_gpu.circular_store end %[[SEGMENT]], %[[END]] {scopeId = 0 : i32} : !proton_gpu.segment<1024, #smem, warp>, i32
   // CHECK-NEXT: gpu.barrier
@@ -26,12 +26,12 @@ module attributes {"ttg.num-warps" = 8 : i32} {
   // CHECK: %[[SCRATCH:.*]] = proton_gpu.global_scratch_alloc {alignment = 128 : i32, nbytes = 1152 : i32} : !tt.ptr<i32>
   // CHECK-NEXT: %[[BUF:.*]] = ttg.local_alloc  : () -> !ttg.memdesc<256xi32, #shared, #smem, mutable>
   // CHECK-NEXT: %[[SEGMENT:.*]] = proton_gpu.segment_alloc %[[BUF]]
-  // CHECK-NEXT: %[[START1:.*]] = proton_gpu.read_counter : i32
-  // CHECK-NEXT: %[[START2:.*]] = proton_gpu.read_counter : i32
-  // CHECK-NEXT: %[[END2:.*]] = proton_gpu.read_counter : i32
+  // CHECK-NEXT: %[[START1:.*]] = proton_gpu.read_counter {metric = 0 : i32} : i32
+  // CHECK-NEXT: %[[START2:.*]] = proton_gpu.read_counter {metric = 0 : i32} : i32
+  // CHECK-NEXT: %[[END2:.*]] = proton_gpu.read_counter {metric = 0 : i32} : i32
   // CHECK-NEXT: proton_gpu.circular_store start %[[SEGMENT]], %[[START2]] {scopeId = 1 : i32} : !proton_gpu.segment<1024, #smem, warp>, i32
   // CHECK-NEXT: proton_gpu.circular_store end %[[SEGMENT]], %[[END2]] {scopeId = 1 : i32} : !proton_gpu.segment<1024, #smem, warp>, i32
-  // CHECK-NEXT: %[[END1:.*]] = proton_gpu.read_counter : i32
+  // CHECK-NEXT: %[[END1:.*]] = proton_gpu.read_counter {metric = 0 : i32} : i32
   // CHECK-NEXT: proton_gpu.circular_store start %[[SEGMENT]], %[[START1]] {scopeId = 0 : i32} : !proton_gpu.segment<1024, #smem, warp>, i32
   // CHECK-NEXT: proton_gpu.circular_store end %[[SEGMENT]], %[[END1]] {scopeId = 0 : i32} : !proton_gpu.segment<1024, #smem, warp>, i32
   // CHECK-NEXT: gpu.barrier

@@ -27,7 +27,7 @@ struct GetNumProgramsOpConversion
     std::string sreg = numCTAs == 1 ? "nctaid." : "nclusterid.";
     sreg.append(1, 'x' + op.getAxisAsInt()); // 0 -> 'x', 1 -> 'y', 2 -> 'z'
 
-    Value numPrograms = LLVM::NVIDIA::getSRegValue(rewriter, loc, sreg);
+    Value numPrograms = LLVM::NVIDIA::getSRegValue(rewriter, loc, sreg, i32_ty);
     rewriter.replaceOp(op, numPrograms);
     return success();
   }

@@ -93,14 +93,15 @@ Value llGetPid(Location loc, RewriterBase &rewriter, ModuleOp moduleOp,
 
   std::string sreg = numCTAs == 1 ? "ctaid." : "clusterid.";
   sreg.append(1, 'x' + axis); // 0 -> 'x', 1 -> 'y', 2 -> 'z'
-  return getSRegValue(rewriter, loc, sreg);
+  return getSRegValue(rewriter, loc, sreg, i32_ty);
 }
 
-Value getSRegValue(OpBuilder &rewriter, Location loc, StringRef sRegStr) {
+Value getSRegValue(OpBuilder &rewriter, Location loc, StringRef sRegStr,
+                   TypeRange types) {
   ValueRange args;
   auto intrName = Twine("llvm.nvvm.read.ptx.sreg.") + sRegStr;
   auto callOp =
-      createLLVMIntrinsicCallOp(rewriter, loc, intrName.str(), i32_ty, args);
+      createLLVMIntrinsicCallOp(rewriter, loc, intrName.str(), types, args);
   return callOp.getResult(0);
 }
 

@@ -25,7 +25,8 @@ namespace LLVM {
 
 namespace NVIDIA {
 
-Value getSRegValue(OpBuilder &b, Location loc, StringRef sRegStr);
+Value getSRegValue(OpBuilder &b, Location loc, StringRef sRegStr,
+                   TypeRange types);
 Value shuffleXor(Location loc, RewriterBase &rewriter, Value val, int i);
 Value shuffleUp(Location loc, RewriterBase &rewriter, Value val, int i);
 Value shuffleIdx(Location loc, RewriterBase &rewriter, Value val, int i);

@@ -20,6 +20,9 @@ class TargetInfo : public mlir::triton::proton::gpu::TargetInfoBase {
   Value clock(ConversionPatternRewriter &rewriter, Location loc,
               bool isClock64) const override;
 
+  Value realtime(ConversionPatternRewriter &rewriter,
+                 Location loc) const override;
+
   Value processorId(ConversionPatternRewriter &rewriter,
                     Location loc) const override;
 

@@ -17,6 +17,9 @@ class TargetInfo : public mlir::triton::proton::gpu::TargetInfoBase {
   Value clock(ConversionPatternRewriter &rewriter, Location loc,
               bool isClock64) const override;
 
+  Value realtime(ConversionPatternRewriter &rewriter,
+                 Location loc) const override;
+
   Value processorId(ConversionPatternRewriter &rewriter,
                     Location loc) const override;
 

@@ -17,9 +17,15 @@ class TargetInfoBase {
     return helper;
   }
 
+  // Return the local cycle counter value.
   virtual Value clock(ConversionPatternRewriter &rewriter, Location loc,
                       bool isClock64) const = 0;
 
+  // Return the global cycle counter value (i.e., synchronized across SMs) in
+  // nanoseconds, regardless of the clock frequency.
+  virtual Value realtime(ConversionPatternRewriter &rewriter,
+                         Location loc) const = 0;
+
   virtual Value processorId(ConversionPatternRewriter &rewriter,
                             Location loc) const = 0;
 

@@ -21,7 +21,8 @@ def ConvertProtonToProtonGPU: Pass<"convert-proton-to-protongpu", "mlir::ModuleO
               "MetricType", /*default*/"MetricType::CYCLE",
               "The performance counter metric type we are profiling",
               /*parser*/[{::llvm::cl::values(
-                    clEnumValN(MetricType::CYCLE, "cycle", "Cycle")
+                    clEnumValN(MetricType::CYCLE, "cycle", "Cycle"),
+                    clEnumValN(MetricType::REALTIME, "realtime", "Realtime")
               )}]>,
        Option<"granularity", "granularity",
               "gpu::Granularity", /*default*/"gpu::Granularity::WARP",

@@ -7,12 +7,14 @@ def MetricTypeAttr : I32EnumAttr<
   "MetricType", "The type of metric to be profiled",
   [
     I32EnumAttrCase<"CYCLE", 0, "cycle">,
+    I32EnumAttrCase<"REALTIME", 1, "realtime">,
   ]> {
   let cppNamespace = "::mlir::triton::proton";
   let description = [{
     Attribute to indicate the metric to be profiled.
     The following metrics are supported:
     - CYCLE: Cycle count metric.
+    - REALTIME: Wallclock time metric.
   }];
 }
 

@@ -68,7 +68,7 @@ def PTG_ReadCounterOp : PTG_Op<"read_counter", [
   }];
 
   let arguments = (ins
-    DefaultValuedAttr<MetricTypeAttr, "MetricType::CYCLE">:$metric
+    MetricTypeAttr:$metric
   );
 
   let results = (outs AnyTypeOf<[I32, I64]>:$counter);