6464)
6565
// Package-level state shared across the eval suite.
var (
	// tempDir is the working directory the claude subprocesses run in; it is
	// cleaned up during suite teardown.
	tempDir string

	// NOTE(review): these two are populated outside this view — presumably the
	// path of the local checkout and the names of the discovered test cases;
	// confirm against the suite setup.
	localRepoRoot string
	testCases     []string

	// Model identifiers. NOTE(review): presumably chosen per tier and handed
	// to the claude CLI via --model; the assignments are outside this view.
	goldenModel      string
	integrationModel string
	judgeModel       string

	// Running cost totals in USD, accumulated after every reviewer and judge
	// invocation and printed once when the suite finishes.
	totalReviewerCost float64
	totalJudgeCost    float64
)
7476
// claudeOutput is the subset of the JSON object emitted by the claude CLI
// when it is invoked with "--output-format json" that this suite decodes.
// Keys not listed here are ignored by encoding/json.
//
// NOTE(review): the CLI's JSON envelope reportedly also carries an error
// indicator (e.g. "is_error"); confirm against the CLI reference and consider
// checking it so a failed run is not mistaken for a valid result.
type claudeOutput struct {
	// Type is the value of the envelope's "type" key; it is decoded but not
	// inspected by the callers visible here.
	Type string `json:"type"`
	// Result is the model's textual response: the review text for the
	// reviewer run, or the verdict JSON (possibly fenced) for the judge run.
	Result string `json:"result"`
	// TotalCostUSD is the cost the CLI reports for this invocation, in USD.
	TotalCostUSD float64 `json:"total_cost_usd"`
}
82+
7583func TestEval (t * testing.T ) {
7684 RegisterFailHandler (Fail )
7785 RunSpecs (t , "API Review Eval Suite" )
@@ -123,6 +131,7 @@ var _ = AfterSuite(func() {
123131 By ("cleaning up temp directory" )
124132 os .RemoveAll (tempDir )
125133 }
134+ fmt .Printf ("\n Total Cost: $%.4f (Reviewer: $%.4f, Judge: $%.4f)\n " , totalReviewerCost + totalJudgeCost , totalReviewerCost , totalJudgeCost )
126135})
127136
128137func copyLocalFiles () {
@@ -251,7 +260,7 @@ func readAndApplyPatch(patchPath string) {
251260}
252261
253262// runAPIReview and runJudge can probably share some common code.
254- func runAPIReview (model string ) string {
263+ func runAPIReview (model string ) ( string , float64 ) {
255264 By (fmt .Sprintf ("running API review via Claude (%s)" , model ))
256265 ctx , cancel := context .WithTimeout (context .Background (), claudeTimeout )
257266 defer cancel ()
@@ -262,15 +271,22 @@ func runAPIReview(model string) string {
262271 "--model" , model ,
263272 "-p" , "/api-review" ,
264273 "--allowedTools" , "Bash,Read,Grep,Glob,Task" ,
274+ "--output-format" , "json" ,
265275 )
266276 cmd .Dir = tempDir
267277
268278 output , err := cmd .CombinedOutput ()
269279 Expect (err ).NotTo (HaveOccurred (), "claude command failed: %s" , string (output ))
270- return string (output )
280+
281+ var parsed claudeOutput
282+ err = json .Unmarshal (output , & parsed )
283+ Expect (err ).NotTo (HaveOccurred (), "failed to parse claude output: %s" , string (output ))
284+
285+ totalReviewerCost += parsed .TotalCostUSD
286+ return parsed .Result , parsed .TotalCostUSD
271287}
272288
273- func runJudge (model , reviewOutput , expectedIssues string ) evalResult {
289+ func runJudge (model , reviewOutput , expectedIssues string ) ( evalResult , float64 ) {
274290 By (fmt .Sprintf ("comparing results with Claude judge (%s)" , model ))
275291 ctx , cancel := context .WithTimeout (context .Background (), claudeTimeout )
276292 defer cancel ()
@@ -281,17 +297,24 @@ func runJudge(model, reviewOutput, expectedIssues string) evalResult {
281297 "--dangerously-skip-permissions" ,
282298 "--model" , model ,
283299 "-p" , prompt ,
300+ "--output-format" , "json" ,
284301 )
285302 cmd .Dir = tempDir
286303
287304 output , err := cmd .CombinedOutput ()
288305 Expect (err ).NotTo (HaveOccurred (), "claude judge command failed: %s" , string (output ))
289306
307+ var parsed claudeOutput
308+ err = json .Unmarshal (output , & parsed )
309+ Expect (err ).NotTo (HaveOccurred (), "failed to parse judge output: %s" , string (output ))
310+
311+ totalJudgeCost += parsed .TotalCostUSD
312+
290313 var result evalResult
291- jsonStr := stripMarkdownCodeBlock (string ( output ) )
314+ jsonStr := stripMarkdownCodeBlock (parsed . Result )
292315 err = json .Unmarshal ([]byte (jsonStr ), & result )
293- Expect (err ).NotTo (HaveOccurred (), "failed to parse judge response as JSON: %s" , string ( output ) )
294- return result
316+ Expect (err ).NotTo (HaveOccurred (), "failed to parse judge response as JSON: %s" , parsed . Result )
317+ return result , parsed . TotalCostUSD
295318}
296319
297320func runTestCase (tier , tc , reviewModel , judgeModelName string ) {
@@ -304,9 +327,10 @@ func runTestCase(tier, tc, reviewModel, judgeModelName string) {
304327 Expect (err ).NotTo (HaveOccurred ())
305328 expectedIssues := strings .TrimSpace (string (expectedContent ))
306329
307- reviewOutput := runAPIReview (reviewModel )
308- result := runJudge (judgeModelName , reviewOutput , expectedIssues )
330+ reviewOutput , reviewCost := runAPIReview (reviewModel )
331+ result , judgeCost := runJudge (judgeModelName , reviewOutput , expectedIssues )
309332
333+ GinkgoWriter .Printf ("Cost: Reviewer=$%.4f, Judge=$%.4f, Total=$%.4f\n " , reviewCost , judgeCost , reviewCost + judgeCost )
310334 GinkgoWriter .Printf ("Judge result: pass=%v, reason=%s\n " , result .Pass , result .Reason )
311335 Expect (result .Pass ).To (BeTrue (), "API review did not match expected issues.\n Judge reason: %s\n Review output:\n %s\n Expected issues:\n %s" , result .Reason , reviewOutput , expectedIssues )
312336}
0 commit comments