From 0656ceaa6972087ba510c6da60ee549c0a7de392 Mon Sep 17 00:00:00 2001 From: cypress evelyn masso Date: Tue, 15 Oct 2024 19:05:35 -0700 Subject: [PATCH] Update flakiness script (#30) * update test names * increase default runs number * use most common responses for each test id as comparison * also use presentationNumber from API responses * Update stress-test.mts * logging cleanup - now with markdown! * use a nice human label for workflow id in markdown * output final summary as markdown table * markdown reformatting progress (#34) * WIP: markdown reformatting progress * fix for header section * some changes based on carmen feedback * Total unequal % - carmen request * remove old code comment * remove old code comment * link to header * adding some additional notes to stress test readme * add notes about using a personal non-network fork * simplify join in formatResponses * remove old settle tracking we got this from p-limit * Update stressor/stress-test.mts * Apply suggestions from code review Co-authored-by: Mx Corey Frang --------- Co-authored-by: cypress evelyn masso * Update stressor/README.md Co-authored-by: jugglinmike * Update stressor/README.md Co-authored-by: jugglinmike * Add try...finally for clearInterval * use _.isEqual instead of JSON.stringify --------- Co-authored-by: Mx Corey Frang Co-authored-by: jugglinmike Co-authored-by: Mx. 
Corey Frang --- .gitignore | 1 + stressor/README.md | 8 +- stressor/package-lock.json | 57 ++++- stressor/package.json | 6 +- stressor/stress-test.mts | 498 +++++++++++++++++++++++++++---------- 5 files changed, 427 insertions(+), 143 deletions(-) diff --git a/.gitignore b/.gitignore index abe1958..b9d6d75 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ # Logs +stressor/stressor-run.json logs *.log npm-debug.log* diff --git a/stressor/README.md b/stressor/README.md index 4887de8..67c77d3 100644 --- a/stressor/README.md +++ b/stressor/README.md @@ -15,8 +15,10 @@ It works sequentially through the list of test plans, completing the runs for on ## Running -Run it with `npm run stress-test`. - -Running the script can take a while, as it is constrained by GitHub Actions availability and speed. +1. It is preferred for you to run the stress test against your own personal "non-fork" of this repo (create a personal repo and push to it instead of using "fork" so it isn't part of the "network") to limit the number of action runs against the main branch. +2. Update the stress-test.mts file `owner`, `repo`, and `defaultBranch` definitions near the top, as well as setting up the tests / matrix you want to test. +3. Run it with `npm run --silent stress-test | tee some-output-file.md`. +4. Running the script can take a while, as it is constrained by GitHub Actions availability and speed. +Will need the occasional manual job restart on GitHub when the ngrok tunnel sometimes fails (maybe 1 out of 20 runs). 
Set an environment variable `DEBUG` to `1` or `true` to get extra logging diff --git a/stressor/package-lock.json b/stressor/package-lock.json index 48d8795..3e6de14 100644 --- a/stressor/package-lock.json +++ b/stressor/package-lock.json @@ -10,10 +10,14 @@ "license": "MIT", "dependencies": { "@octokit/rest": "^21.0.1", + "@types/lodash.isequal": "^4.5.8", "jest-diff": "^29.7.0", + "lodash.isequal": "^4.5.0", "ngrok": "^5.0.0-beta.2", + "p-limit": "^6.1.0", "ts-node": "^10.9.2", - "typescript": "^5.5.4" + "typescript": "^5.5.4", + "word-wrap": "^1.2.5" }, "devDependencies": { "@octokit/types": "^13.5.0", @@ -287,6 +291,19 @@ "@types/node": "*" } }, + "node_modules/@types/lodash": { + "version": "4.17.10", + "resolved": "https://registry.npmjs.org/@types/lodash/-/lodash-4.17.10.tgz", + "integrity": "sha512-YpS0zzoduEhuOWjAotS6A5AVCva7X4lVlYLF0FYHAY9sdraBfnatttHItlWeZdGhuEkf+OzMNg2ZYAx8t+52uQ==" + }, + "node_modules/@types/lodash.isequal": { + "version": "4.5.8", + "resolved": "https://registry.npmjs.org/@types/lodash.isequal/-/lodash.isequal-4.5.8.tgz", + "integrity": "sha512-uput6pg4E/tj2LGxCZo9+y27JNyB2OZuuI/T5F+ylVDYuqICLG2/ktjxx0v6GvVntAf8TvEzeQLcV0ffRirXuA==", + "dependencies": { + "@types/lodash": "*" + } + }, "node_modules/@types/node": { "version": "20.14.12", "resolved": "https://registry.npmjs.org/@types/node/-/node-20.14.12.tgz", @@ -647,6 +664,11 @@ "resolved": "https://registry.npmjs.org/lodash.clonedeep/-/lodash.clonedeep-4.5.0.tgz", "integrity": "sha512-H5ZhCF25riFd9uB5UCkVKo61m3S/xZk1x4wA6yp/L3RFP6Z/eHH1ymQcGLo7J3GMPfm0V/7m1tryHuGVxpqEBQ==" }, + "node_modules/lodash.isequal": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/lodash.isequal/-/lodash.isequal-4.5.0.tgz", + "integrity": "sha512-pDo3lu8Jhfjqls6GkMgpahsF9kCyayhgykjyLMNFTKWrpVdAQtYyB4muAMWozBB4ig/dtWAmsMxLEI8wuz+DYQ==" + }, "node_modules/lowercase-keys": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/lowercase-keys/-/lowercase-keys-2.0.0.tgz", @@ -722,6 
+744,20 @@ "node": ">=8" } }, + "node_modules/p-limit": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-6.1.0.tgz", + "integrity": "sha512-H0jc0q1vOzlEk0TqAKXKZxdl7kX3OFUzCnNVUnq5Pc3DGo0kpeaMuPqxQn235HibwBEb0/pm9dgKTjXy66fBkg==", + "dependencies": { + "yocto-queue": "^1.1.1" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/pend": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz", @@ -886,6 +922,14 @@ "resolved": "https://registry.npmjs.org/v8-compile-cache-lib/-/v8-compile-cache-lib-3.0.1.tgz", "integrity": "sha512-wa7YjyUGfNZngI/vtK0UHAN+lgDCxBPCylVXGp0zu59Fz5aiGtNXaq3DhIov063MorB+VfufLh3JlF2KdTK3xg==" }, + "node_modules/word-wrap": { + "version": "1.2.5", + "resolved": "https://registry.npmjs.org/word-wrap/-/word-wrap-1.2.5.tgz", + "integrity": "sha512-BN22B5eaMMI9UMtjrGd5g5eCYPpCPDUy0FJXbYsaT5zYxjFOckS53SQDE3pWkVoWpHXVb3BrYcEN4Twa55B5cA==", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/wrappy": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", @@ -918,6 +962,17 @@ "engines": { "node": ">=6" } + }, + "node_modules/yocto-queue": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/yocto-queue/-/yocto-queue-1.1.1.tgz", + "integrity": "sha512-b4JR1PFR10y1mKjhHY9LaGo6tmrgjit7hxVIeAmyMw3jegXR4dhYqLaQF5zMXZxY7tLpMyJeLjr1C4rLmkVe8g==", + "engines": { + "node": ">=12.20" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } } } } diff --git a/stressor/package.json b/stressor/package.json index 955cb52..e3ce42d 100644 --- a/stressor/package.json +++ b/stressor/package.json @@ -11,10 +11,14 @@ "license": "MIT", "dependencies": { "@octokit/rest": "^21.0.1", + "@types/lodash.isequal": "^4.5.8", "jest-diff": "^29.7.0", + "lodash.isequal": "^4.5.0", "ngrok": "^5.0.0-beta.2", + "p-limit": "^6.1.0", "ts-node": "^10.9.2", - "typescript": 
"^5.5.4" + "typescript": "^5.5.4", + "word-wrap": "^1.2.5" }, "devDependencies": { "@octokit/types": "^13.5.0", diff --git a/stressor/stress-test.mts b/stressor/stress-test.mts index 8875169..28e45ff 100644 --- a/stressor/stress-test.mts +++ b/stressor/stress-test.mts @@ -2,42 +2,53 @@ import * as http from "node:http"; import ngrok from "ngrok"; import { Octokit } from "@octokit/rest"; import { diff } from "jest-diff"; +import test, { run } from "node:test"; +import wrap from "word-wrap"; +import pLimit from "p-limit"; +import isEqual from "lodash.isequal"; const DEBUG = process.env.DEBUG === "true" || process.env.DEBUG === "1"; +const limitWorkflows = pLimit(8); const testPlans = [ "tests/menu-button-actions-active-descendant", - // "tests/alert", - // "tests/horizontal-slider", - // "tests/command-button", - // "tests/disclosure-navigation", - // "tests/link-span-text", - // "tests/dialog", - // "tests/menu-button-navigation", - // "tests/radiogroup-aria-activedescendant", - // "tests/toggle-button/toggle-button-navigation", + "tests/alert", + "tests/horizontal-slider", + "tests/command-button", + "tests/disclosure-navigation", + "tests/link-span-text", + "tests/modal-dialog", + "tests/menu-button-navigation", + "tests/radiogroup-aria-activedescendant", + "tests/toggle-button", ]; const owner = "bocoup", repo = "aria-at-gh-actions-helper"; const defaultBranch = "main"; + +// ordered this way because voiceover usually finishes quicker, and when you only +// have 3 jobs left it matters... 
:) const testingMatrix = [ - { - workflowId: "voiceover-test.yml", - browsers: ["safari", "chrome", "firefox"], - }, { workflowId: "nvda-test.yml", browsers: ["chrome", "firefox"], }, + { + workflowId: "voiceover-test.yml", + browsers: ["safari", "chrome", "firefox"], + }, ]; + const port = 8888; const workflowHeaderKey = "x-workflow-key"; -const numRuns = 2; +const numRuns = 5; interface WorkflowCallbackPayload { status: string; - testCsvRow: number; + testCsvRow?: number; + presentationNumber?: number; responses?: Array; + externalLogsUrl?: string; } interface TestCombination { @@ -51,22 +62,67 @@ type WorkflowRunResults = Array<{ testCsvRow: number; }>; +type WorkflowRun = { + runLogsUrl: string; + results: WorkflowRunResults; +}; + +type ComparisonTestRunDifference = { + runId: number; + responses: Array; +} + +type ComparisonTestRunResult = { + testCsvRow: number; + baselineResponses: Array; + differences: Array; +}; + +interface ComparisonRunResult { + percentUnequal: number; + totalRows: number; + equalRows: number; + unequalRows: number; + comparedResults: Array; +} + +type CompleteTestComboRunResult = ComparisonRunResult & TestCombination & { + logUrls: Array; +}; + /** * Logs the message to the console if DEBUG is true */ const debugLog = (...args: Parameters): void => { if (DEBUG) { - console.debug(...args); + // using console.error to print to STDERR + console.error("[DEBUG]:", ...args); } }; +/* + * Get a nice human readable string for the given GitHub workflow id + */ +function workflowIdAsLabel(workflowId: string): string { + switch (workflowId) { + case "voiceover-test.yml": + return "VoiceOver"; + + case "nvda-test.yml": + return "NVDA"; + + default: + return workflowId; + } +} + /** * Creates a unique key for a workflow run, given the test combo and run index * The key is used to identify the callbacks for a given test combo run */ function getWorkflowRunKey(combination: TestCombination, runIndex: number) { const { workflowId, workflowBrowser, 
workflowTestPlan } = combination; - return `${runIndex}-${workflowId}-${workflowBrowser}-${workflowTestPlan}`; + return `${workflowTestPlan}-${workflowId}-${workflowBrowser}-${runIndex}`; } /** @@ -103,7 +159,7 @@ async function setUpTestComboCallbackListener( testCombination: TestCombination, runIndex: number ) { - const promise = new Promise((resolvePromise) => { + const promise = new Promise((resolvePromise) => { const uniqueWorkflowHeaderValue = `${getWorkflowRunKey( testCombination, runIndex @@ -123,29 +179,37 @@ async function setUpTestComboCallbackListener( const parsedBody: WorkflowCallbackPayload = JSON.parse(body); if (parsedBody.status === "COMPLETED") { + debugLog(`${getWorkflowRunKey(testCombination, runIndex)}: received\n${body}`); // if results are included, then we collect them // if not, then we assume this is a status update and the test plan is done if (parsedBody.responses !== undefined) { results.push({ screenreaderResponses: parsedBody.responses, - testCsvRow: parsedBody.testCsvRow, + testCsvRow: + parsedBody.testCsvRow ?? parsedBody.presentationNumber ?? -1, }); } else { + const runLogsUrl = parsedBody.externalLogsUrl ?? 
"url not collected"; debugLog( `Workflow run ${getWorkflowRunKey( testCombination, runIndex )} finished.` ); - resolvePromise(results); + resolvePromise({results, runLogsUrl}); server.removeListener("request", requestListener); } + } else if (parsedBody.status === "ERROR") { + // BELL in case the terminal supports it + process.stderr.write('\u0007'); + console.error("[ERROR]:", `${getWorkflowRunKey(testCombination, runIndex)}: received\n${body}`); } res.end(); }); } }; server.on("request", requestListener); + debugLog(`Workflow run ${getWorkflowRunKey(testCombination, runIndex)} listener started.`); }); return promise; @@ -176,9 +240,10 @@ async function dispatchWorkflowForTestCombo( )}`, }, }); + debugLog(`Dispatched ${testComboToString(testCombo)} Run #${runIndex}`) return true; } catch (e) { - console.log( + console.error( `Run ${runIndex} of ${testComboToString(testCombo)} failed to dispatch.` ); console.error(e); @@ -186,82 +251,135 @@ async function dispatchWorkflowForTestCombo( } } +/** + * Find the most common set of screenreader responses for each test in this set of runs + * In other words, it finds the most for results of the same testCsv number + * within this collection of run results. 
+ * + * @returns a synthetic results array where each element is the mode for its csvRow + */ +function findMostCommonRunResults( + runs: ReadonlyArray +): WorkflowRunResults { + // Group responses by testCsvRow + const groupedResponses: Map>> = new Map(); + + runs.forEach((run) => { + run.results.forEach((row) => { + if (!groupedResponses.has(row.testCsvRow)) { + groupedResponses.set(row.testCsvRow, []); + } + groupedResponses.get(row.testCsvRow)!.push(row.screenreaderResponses); + }); + }); + + // Find mode for each testCsvRow + const modeResponses: WorkflowRunResults = Array.from( + groupedResponses.entries() + ).map(([testCsvRow, responses]) => { + const mode = findMode(responses); + return { + testCsvRow, + screenreaderResponses: mode, + }; + }); + + return modeResponses; +} + +function findMode(arr: Array>): Array { + const counts = new Map(); + let maxCount = 0; + let mode: Array = []; + + arr.forEach((item) => { + const key = JSON.stringify(item); + const count = (counts.get(key) || 0) + 1; + counts.set(key, count); + + if (count > maxCount) { + maxCount = count; + mode = item; + } + }); + + return mode; +} + /** * Checks the results in a set of workflow runs for population and equality * @returns An object with percentages of populated and equal results */ -function checkRunSetResults(results: Array) { +function checkRunSetResults(runs: Array): ComparisonRunResult { let totalRows = 0; - let populatedRows = 0; let equalRows = 0; - results.forEach((workflowResults, workflowIndex) => { - totalRows += workflowResults.length; - - workflowResults.forEach((row, rowIndex) => { - // Check for populated responses - const isRowPopulated = row.screenreaderResponses.every( - (s: string) => s !== null && s.trim().length !== 0 - ); - if (isRowPopulated) { - populatedRows++; + const comparisonWorkflowRunResults = findMostCommonRunResults(runs); + const comparedResults: Array = []; + comparisonWorkflowRunResults.forEach((compTest) => { + const { testCsvRow, 
screenreaderResponses: baselineResponses } = compTest; + const differences: Array = []; + runs.forEach((run, i) => { + totalRows++; + const resultResponses = + run.results.findLast((l) => l.testCsvRow === compTest.testCsvRow) + ?.screenreaderResponses ?? []; + if (isEqual(resultResponses, baselineResponses)) { + equalRows++; } else { - console.error( - `Test CSV row ${row.testCsvRow} has a blank response from screenreader` - ); - console.error(row.screenreaderResponses); - } - - // Check for equal responses (skip first workflow as it's the reference) - if (workflowIndex > 0) { - const isRowEqual = row.screenreaderResponses.every( - (a: string, j: number) => - a === results[0][rowIndex].screenreaderResponses[j] - ); - if (isRowEqual) { - equalRows++; - } else { - console.error( - `Run #${workflowIndex} of Test CSV row ${row.testCsvRow} has screenreader responses different from Run 0` - ); - console.error( - diff( - row.screenreaderResponses, - results[0][rowIndex].screenreaderResponses - ) - ); - } + differences.push({ runId: i, responses: resultResponses }); } }); + comparedResults.push({ testCsvRow, baselineResponses, differences }); }); - const totalRowsExcludingFirst = totalRows - results[0].length; - const percentPopulated = ((totalRows - populatedRows) / totalRows) * 100; - const percentEqual = - ((totalRowsExcludingFirst - equalRows) / totalRowsExcludingFirst) * 100; + const percentUnequal = ((totalRows - equalRows) / totalRows) * 100; - console.log( - `Percentage of rows with unpopulated responses: ${percentPopulated.toFixed( - 2 - )}%, (${totalRows - populatedRows} of ${totalRows})` - ); - console.log( - `Percentage of rows with unequal responses: ${percentEqual.toFixed(2)}%, (${ - totalRowsExcludingFirst - equalRows - } of ${totalRowsExcludingFirst})` - ); return { - percentUnpopulated: percentPopulated, - percentUnequal: percentEqual, + comparedResults, + totalRows: totalRows, + equalRows: equalRows, + unequalRows: totalRows - equalRows, + percentUnequal, 
}; } +const dispatchAndListen = async(testCombo: TestCombination, runIndex: number): Promise => { + const dispatched = await dispatchWorkflowForTestCombo( + testCombo, + runIndex + ); + if (dispatched) { + return await setUpTestComboCallbackListener( + testCombo, + runIndex + ); + } else { + throw new Error('dispatch failed'); + } +}; + +const spawnAndCollectWorkflows = async (testCombo: TestCombination): Promise => { + const runPromises: Array> = []; + for (let runIndex = 0; runIndex < numRuns; runIndex++) { + runPromises.push(limitWorkflows(() => dispatchAndListen(testCombo, runIndex))); + } + // Wait to get all results from parallel runs of the same test combo + const runResults = await Promise.all(runPromises); + // Check if all the results are good + const runResultStats = checkRunSetResults(runResults); + const comboResult: CompleteTestComboRunResult = { ...testCombo, ...runResultStats, logUrls: runResults.map(run => run.runLogsUrl) }; + debugLog(`${testComboToString(testCombo)} done`, comboResult); + allResults.set(testCombo, comboResult); + return comboResult; +} + // Get all the test combos const testCombinations = enumerateTestCombinations(testingMatrix, testPlans); -console.log("Test Plans:\n", testPlans); -console.log("Testing Matrix:\n", testingMatrix); -console.log( +debugLog("Test Plans:\n", testPlans); +debugLog("Testing Matrix:\n", testingMatrix); +debugLog( `Will dispatch ${ testCombinations.length } test combinations ${numRuns} times, for a total of ${ @@ -271,86 +389,190 @@ console.log( const server = http.createServer(); server.listen(port); -console.log(`Local server started at port ${port}`); +debugLog(`Local server started at port ${port}`); server.setMaxListeners(50); const ngrokUrl = await ngrok.connect({ port, }); -console.log(`Ngrok tunnel started at ${ngrokUrl}`); +debugLog(`Ngrok tunnel started at ${ngrokUrl}`); process.on("beforeExit", (code) => { server.close(); ngrok.kill(); - console.log("Exiting with code: ", code); + 
console.error("Exiting with code: ", code); }); const octokitClient = new Octokit({ auth: process.env.GITHUB_TOKEN, }); -// Step through testPlans, waiting for those CI runs to finish before the next begin -for (const testPlan of testPlans) { - console.log( - `===============\nRunning tests for test plan ${testPlan}.\n===============` - ); - // Filter the list of test combos to only those for this test plan - const testCombosForTestPlan = testCombinations.filter( - (testCombo) => testCombo.workflowTestPlan === testPlan - ); - // For each test plan, run each test combo in parallel - const testCombinationResults = await Promise.all( - testCombosForTestPlan.map(async (testCombo: TestCombination) => { - const runPromises = []; - for (let runIndex = 0; runIndex < numRuns; runIndex++) { - const dispatched = await dispatchWorkflowForTestCombo( - testCombo, - runIndex - ); - if (dispatched) { - const listenerPromise = setUpTestComboCallbackListener( - testCombo, - runIndex - ); - runPromises.push(listenerPromise); - } - } - debugLog( - `Dispatched ${ - runPromises.length - } workflow runs for combination ${testComboToString(testCombo)}.` - ); - // Wait to get all results from parallel runs of the same test combo - const runResults = await Promise.all(runPromises); - // Check if all the results are good - console.log( - `Checking results for test combo ${testComboToString(testCombo)}.` - ); - const runResultStats = checkRunSetResults(runResults); +const allResults: Map = new Map(); - return { ...testCombo, ...runResultStats }; - }) - ); +// Debug helper: read the needed "allResults" for this run to a json file +// import { readFile } from "node:fs/promises"; +// const allResults: Map = new Map( +// JSON.parse(await readFile("stressor-run.json", "utf-8")) +// ); - console.log( - `===============\nCompleted tests for test plan ${testPlan} with results: \n===============` - ); - testCombinationResults.forEach((result) => { - console.log(`${result.workflowId} + 
${result.workflowBrowser}`); - console.log( - `Unpopulated responses across all ${numRuns} runs: ${result.percentUnpopulated.toFixed( - 2 - )}%` - ); - console.log( - `Unequal responses between all ${numRuns} runs: ${result.percentUnequal.toFixed( - 2 - )}%` - ); - }); - console.log(`==============================`); + +if (allResults.size == 0) { + const logStatusInterval = setInterval(() => { + // write direct to stderr to not get piped to markdown output. + process.stderr.write(`Workflow queue status: ${limitWorkflows.activeCount} active, ${limitWorkflows.pendingCount} pending.\n`); + }, 60000); + + try { + // Step through testPlans, waiting for those CI runs to finish before the next begin + await Promise.all(testPlans.flatMap(testPlan => { + // Filter the list of test combos to only those for this test plan + const testCombosForTestPlan = testCombinations.filter( + (testCombo) => testCombo.workflowTestPlan === testPlan + ); + // For each test plan, run each test combo in parallel + return testCombosForTestPlan.map(spawnAndCollectWorkflows); + })); + } + finally { + clearInterval(logStatusInterval); + } } +const formatResultsForMD = (results: Map) => { + const keys = [...results.keys()]; + const values = [...results.values()]; + + const scoring = { + workflowTestPlan: [...new Set(keys.map(key => key.workflowTestPlan))].sort(), + workflowId: [...new Set(keys.map(key => key.workflowId))].sort(), + workflowBrowser: [...new Set(keys.map(key => key.workflowBrowser))].sort(), + }; + // generate a distinct ordering score for keys + // browser - least significant + // workflow - next most + // test plan - most significant + const score = (key: TestCombination) => ( + scoring.workflowBrowser.indexOf(key.workflowBrowser) + + (scoring.workflowId.indexOf(key.workflowId) * scoring.workflowBrowser.length) + + (scoring.workflowTestPlan.indexOf(key.workflowTestPlan) * scoring.workflowId.length * scoring.workflowBrowser.length) + ) + keys.sort((a, b) => score(a) - score(b)); + 
+ console.log(`# Stress Test Run - Completed ${new Date().toISOString()}\n`); + + const generalSummary = values.reduce((memo, result) => { + return { + totalRuns: memo.totalRuns + result.totalRows, + totalEqual: memo.totalEqual + result.equalRows, + }; + }, { totalRuns: 0, totalEqual: 0 }); + + + console.log(`* __Total Tests:__ ${generalSummary.totalRuns}`); + console.log(`* __Total Unequal %:__ ${((generalSummary.totalRuns - generalSummary.totalEqual) * 100 / generalSummary.totalRuns).toFixed(2)}%`) + console.log(`* __Number of runs per combo:__ ${numRuns}`); + console.log(`* __Maximum possible "Unequal %" based on number of runs:__ ${((numRuns - 1) * 100 / numRuns).toFixed(2)}%`); + console.log(`* __Test Plans:__\n`); + for (const plan of testPlans) { + console.log(` * ${plan}`) + } + console.log(`\n* __Test Matrix:__\n`); + for (const entry of testingMatrix) { + console.log(` * ${entry.workflowId}`); + for (const browser of entry.browsers) { + console.log(` * ${browser}`); + } + } + + type GenerateBy = (arg0: CompleteTestComboRunResult) => string; + type Formatter = (arg0: string) => string; + const generateSummary = (displayTitle: string, by: GenerateBy, formatter: Formatter = identity => identity) => { + console.log(`\n## Summary by ${displayTitle}\n`); + console.log(`| ${displayTitle} | Total Tests | Unequal Responses | Unequal % |`); + console.log("| --- | --- | --- | --- |"); + const allKeys = new Set(values.map(by)); + for (const key of allKeys) { + const { totalRuns, totalEqual } = values + .filter((result) => by(result) === key) + .reduce((memo, result) => { + return { + totalRuns: memo.totalRuns + result.totalRows, + totalEqual: memo.totalEqual + result.equalRows, + }; + }, { totalRuns: 0, totalEqual: 0 }); + const totalUnequal = totalRuns - totalEqual; + console.log(`| ${formatter(key)} | ${totalRuns} | ${totalUnequal} | ${(totalUnequal * 100 / totalRuns).toFixed(2)}% |`); + } + } + + generateSummary('Test Plan', result => result.workflowTestPlan); + 
generateSummary('AT', result => result.workflowId, workflowIdAsLabel); + generateSummary('Browser', result => result.workflowBrowser); + + const generateHeaderTextForCombo = (combo: TestCombination):string => + `${combo.workflowTestPlan} ${workflowIdAsLabel(combo.workflowId)} ${combo.workflowBrowser}`; + + const generateHeaderLinkForCombo = (combo: TestCombination):string => + '#' + generateHeaderTextForCombo(combo) + .replace(/[^\s\w-]/g, '') + .replace(/\s+/g, '-') + .toLowerCase(); + + console.log(`\n## Summary by All\n`); + console.log(`| Test Plan | AT | Browser | Total Tests | Unequal Responses | Unequal % | Heading Link |`); + console.log("| --- | --- | --- | --- | --- | --- | --- |"); + for (const combo of keys) { + const comboResults = results.get(combo); + // typescript insists this is possibly undefined + if (comboResults) { + console.log(`| ${comboResults.workflowTestPlan} | ${workflowIdAsLabel(comboResults.workflowId)} | ${comboResults.workflowBrowser} | ${comboResults.totalRows} | ${comboResults.unequalRows} | ${comboResults.percentUnequal.toFixed(2)}% | [#](${generateHeaderLinkForCombo(combo)}) |`) + } + } + + const formatResponses = (responses: Array, newlineTab: string = '\n '): string => + responses + .map((response, index) => { + const responseWrapped = response.split('\n').map(line => wrap(line, {width: 60, newline: newlineTab})).join(newlineTab); + return `Response ${index+1}:${newlineTab}${responseWrapped}`; + }) + .join('\n') + .replace(/\n(\s*\n)+/g, '\n'); + + for (const combo of keys) { + const comboResults = results.get(combo); + // typescript insists this is possibly undefined + if (comboResults) { + console.log(`\n## ${generateHeaderTextForCombo(combo)}\n`); + console.log(`\n### Run Logs\n`); + let logNumber = 0; + for (const url of comboResults.logUrls) { + console.log(`* [Run #${logNumber++}](${url})`); + } + for (const comparedResult of comboResults.comparedResults) { + console.log(`\n### Test Number: 
${comparedResult.testCsvRow}\n`); + console.log(`__${combo.workflowTestPlan} ${workflowIdAsLabel(combo.workflowId)} ${combo.workflowBrowser}__`); + console.log(`#### Most Common Responses:`); + console.log("```"); + console.log(formatResponses(comparedResult.baselineResponses)); + console.log("```"); + for(const diverges of comparedResult.differences) { + console.log(`#### Divergent responses from [Run ${diverges.runId}](${comboResults.logUrls[diverges.runId]}):`); + console.log("```diff"); + console.log(diff(formatResponses(comparedResult.baselineResponses), formatResponses(diverges.responses))); + console.log("```"); + } + } + } + console.log(``) + } +}; + +formatResultsForMD(allResults); + +// Debug helper: write the needed "allResults" for this run to a json file +import { writeFile } from "node:fs/promises"; +await writeFile("stressor-run.json", JSON.stringify([...allResults.entries()]), "utf-8"); + process.exit(0);