diff --git a/identify-best-comments.ipynb b/identify-best-comments.ipynb
index 4b08414..a514d93 100644
--- a/identify-best-comments.ipynb
+++ b/identify-best-comments.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 1,
"metadata": {},
"outputs": [
{
@@ -70,7 +70,7 @@
"└──────────┴─────────┴────────────┴────────────┴───┴────────────┴────────────┴───────────┴─────────┘"
]
},
- "execution_count": 3,
+ "execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
@@ -87,7 +87,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -106,23 +106,37 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 7,
"metadata": {},
"outputs": [
{
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "sys:1: MapWithoutReturnDtypeWarning: Calling `map_elements` without specifying `return_dtype` can lead to unpredictable results. Specify `return_dtype` to silence this warning.\n"
- ]
+ "data": {
+ "text/plain": [
+ "\"I'm sorry but no\""
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
}
],
+ "source": [
+ "html.unescape(\"I'm sorry but no\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
"source": [
"import html\n",
+ "import re\n",
"\n",
"\n",
"def unescape_html(text):\n",
- " return html.unescape(text)\n",
+ " unescaped = html.unescape(text).replace(\"
\", \"\\n\\n\")\n",
+ " return re.sub(r']*>[^<]+', r\"\\1\", unescaped)\n",
"\n",
"\n",
"df = df.with_columns(\n",
@@ -131,22 +145,23 @@
" pl.col(\"id\"),\n",
" ).alias(\"link\"),\n",
" pl.col(\"time\").dt.strftime(\"%B %d, %Y\").alias(\"date\"),\n",
- " pl.col(\"text\").str.replace_all(\"
\", \"\\n\\n\").alias(\"text\"),\n",
- " pl.col(\"text\")\n",
- " .map_elements(unescape_html, return_dtype=pl.String)\n",
- " .alias(\"text_unescaped\"),\n",
+ " pl.col(\"text\").map_elements(unescape_html, return_dtype=pl.String),\n",
")"
]
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"df.select(pl.col(\"date\", \"by\", \"link\", \"story_title\", \"text\", \"reward\")).head(\n",
" 100\n",
- ").write_csv(\"./data/top_comments_with_links.csv\")\n"
+ ").write_csv(\"./data/top_comments_with_links.csv\")\n",
+ "\n",
+ "df.sort(by=\"reward\", descending=False).select(\n",
+ " pl.col(\"date\", \"by\", \"link\", \"story_title\", \"text\", \"reward\")\n",
+ ").head(100).write_csv(\"./data/bottom_comments_with_links.csv\")"
]
}
],
diff --git a/prepare-dataset.ipynb b/prepare-dataset.ipynb
index 8b1bfde..b7c1e39 100644
--- a/prepare-dataset.ipynb
+++ b/prepare-dataset.ipynb
@@ -2,149 +2,62 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [],
- "source": [
- "import utils\n",
- "import polars as pl\n",
- "\n",
- "df = utils.dataset()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
+ "execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
- "text/html": [
- "
\n",
- "
shape: (36_323_460, 15)id | type | by | time | title | text | url | score | parent | top_level_parent | descendants | kids | deleted | dead | siblings_count |
---|
i64 | str | str | datetime[μs] | str | str | str | i64 | i64 | i64 | i64 | list[i64] | bool | bool | u32 |
15 | "comment" | "sama" | 2006-10-09 19:51:01 | null | ""the rising star of ventur… | null | null | 1 | 1 | null | [17] | null | null | 4 |
17 | "comment" | "pg" | 2006-10-09 19:52:45 | null | "Is there anywhere to eat on Sa… | null | null | 15 | 1 | null | [1079] | null | null | 1 |
22 | "comment" | "pg" | 2006-10-10 02:18:22 | null | "It's kind of funny that Sevin … | null | null | 21 | 21 | null | null | null | null | 1 |
23 | "comment" | "starklysnarky" | 2006-10-10 02:30:53 | null | "This is interesting, but the l… | null | null | 20 | 20 | null | null | null | null | 1 |
30 | "comment" | "spez" | 2006-10-10 15:34:59 | null | "Stay tuned..." | null | null | 29 | 29 | null | [31] | null | null | 1 |
… | … | … | … | … | … | … | … | … | … | … | … | … | … | … |
41813379 | "comment" | "kbolino" | 2024-10-11 20:26:09 | null | "A non-blocking send would work… | null | null | 41812752 | 41809262 | null | null | null | null | 1 |
41813380 | "comment" | "slightwinder" | 2024-10-11 20:26:18 | null | "> It genuinely astounds me … | null | null | 41810691 | 41808943 | null | null | null | null | 4 |
41813381 | "comment" | "marcosdumay" | 2024-10-11 20:26:21 | null | "Add forced sedentarism into th… | null | null | 41812891 | 41811263 | null | null | null | null | 1 |
41813383 | "comment" | "davio" | 2024-10-11 20:26:28 | null | "hims sells the generic version… | null | null | 41813102 | 41811263 | null | null | null | null | 1 |
41813384 | "comment" | "btilly" | 2024-10-11 20:26:29 | null | "Well, if you want a simple arg… | null | null | 41812134 | 41808127 | null | null | null | null | 1 |
"
- ],
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "2559539da6124d6d81613429dbf2c720",
+ "version_major": 2,
+ "version_minor": 0
+ },
"text/plain": [
- "shape: (36_323_460, 15)\n",
- "┌──────────┬─────────┬──────────────┬──────────────┬───┬───────────┬─────────┬──────┬──────────────┐\n",
- "│ id ┆ type ┆ by ┆ time ┆ … ┆ kids ┆ deleted ┆ dead ┆ siblings_cou │\n",
- "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ nt │\n",
- "│ i64 ┆ str ┆ str ┆ datetime[μs] ┆ ┆ list[i64] ┆ bool ┆ bool ┆ --- │\n",
- "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ u32 │\n",
- "╞══════════╪═════════╪══════════════╪══════════════╪═══╪═══════════╪═════════╪══════╪══════════════╡\n",
- "│ 15 ┆ comment ┆ sama ┆ 2006-10-09 ┆ … ┆ [17] ┆ null ┆ null ┆ 4 │\n",
- "│ ┆ ┆ ┆ 19:51:01 ┆ ┆ ┆ ┆ ┆ │\n",
- "│ 17 ┆ comment ┆ pg ┆ 2006-10-09 ┆ … ┆ [1079] ┆ null ┆ null ┆ 1 │\n",
- "│ ┆ ┆ ┆ 19:52:45 ┆ ┆ ┆ ┆ ┆ │\n",
- "│ 22 ┆ comment ┆ pg ┆ 2006-10-10 ┆ … ┆ null ┆ null ┆ null ┆ 1 │\n",
- "│ ┆ ┆ ┆ 02:18:22 ┆ ┆ ┆ ┆ ┆ │\n",
- "│ 23 ┆ comment ┆ starklysnark ┆ 2006-10-10 ┆ … ┆ null ┆ null ┆ null ┆ 1 │\n",
- "│ ┆ ┆ y ┆ 02:30:53 ┆ ┆ ┆ ┆ ┆ │\n",
- "│ 30 ┆ comment ┆ spez ┆ 2006-10-10 ┆ … ┆ [31] ┆ null ┆ null ┆ 1 │\n",
- "│ ┆ ┆ ┆ 15:34:59 ┆ ┆ ┆ ┆ ┆ │\n",
- "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n",
- "│ 41813379 ┆ comment ┆ kbolino ┆ 2024-10-11 ┆ … ┆ null ┆ null ┆ null ┆ 1 │\n",
- "│ ┆ ┆ ┆ 20:26:09 ┆ ┆ ┆ ┆ ┆ │\n",
- "│ 41813380 ┆ comment ┆ slightwinder ┆ 2024-10-11 ┆ … ┆ null ┆ null ┆ null ┆ 4 │\n",
- "│ ┆ ┆ ┆ 20:26:18 ┆ ┆ ┆ ┆ ┆ │\n",
- "│ 41813381 ┆ comment ┆ marcosdumay ┆ 2024-10-11 ┆ … ┆ null ┆ null ┆ null ┆ 1 │\n",
- "│ ┆ ┆ ┆ 20:26:21 ┆ ┆ ┆ ┆ ┆ │\n",
- "│ 41813383 ┆ comment ┆ davio ┆ 2024-10-11 ┆ … ┆ null ┆ null ┆ null ┆ 1 │\n",
- "│ ┆ ┆ ┆ 20:26:28 ┆ ┆ ┆ ┆ ┆ │\n",
- "│ 41813384 ┆ comment ┆ btilly ┆ 2024-10-11 ┆ … ┆ null ┆ null ┆ null ┆ 1 │\n",
- "│ ┆ ┆ ┆ 20:26:29 ┆ ┆ ┆ ┆ ┆ │\n",
- "└──────────┴─────────┴──────────────┴──────────────┴───┴───────────┴─────────┴──────┴──────────────┘"
+ "Resolving data files: 0%| | 0/39 [00:00, ?it/s]"
]
},
- "execution_count": 3,
"metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Ok, I want to start by limiting to comments\n",
- "comments_df = df.filter(pl.col(\"type\") == \"comment\")\n",
- "\n",
- "# add a new column siblings_count\n",
- "comments_df = comments_df.with_columns(\n",
- " [pl.col(\"id\").count().over(\"parent\").alias(\"siblings_count\")]\n",
- ")\n",
- "comments_df"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
+ "output_type": "display_data"
+ },
{
"data": {
- "text/html": [
- "\n",
- "
shape: (24_134_648, 15)id | type | by | time | title | text | url | score | parent | top_level_parent | descendants | kids | deleted | dead | siblings_count |
---|
i64 | str | str | datetime[μs] | str | str | str | i64 | i64 | i64 | i64 | list[i64] | bool | bool | u32 |
194 | "comment" | "jmzachary" | 2007-02-20 22:33:51 | null | "Thanks for the rationale. I'm … | null | null | 189 | 189 | null | [205, 422, 199] | null | null | 20 |
195 | "comment" | "jdroid" | 2007-02-20 22:36:52 | null | "You've filled a hole reddit wa… | null | null | 189 | 189 | null | [259] | null | null | 20 |
199 | "comment" | "Zak" | 2007-02-20 22:48:33 | null | "I don't think the fact that th… | null | null | 194 | 189 | null | [1644, 1897] | null | null | 3 |
205 | "comment" | "ninwa" | 2007-02-20 23:30:23 | null | "Really? I was most interested … | null | null | 194 | 189 | null | [210, 209] | null | null | 3 |
209 | "comment" | "ninwa" | 2007-02-20 23:41:34 | null | "This comment added through the… | null | null | 205 | 189 | null | null | null | null | 2 |
… | … | … | … | … | … | … | … | … | … | … | … | … | … | … |
41813369 | "comment" | "whall6" | 2024-10-11 20:25:08 | null | "If you truly only have 5 minut… | null | null | 41812596 | 41811263 | null | null | null | null | 1 |
41813371 | "comment" | "julianeon" | 2024-10-11 20:25:25 | null | "But consider the tradeoff: it&… | null | null | 41811539 | 41811263 | null | null | null | null | 13 |
41813375 | "comment" | "throw0101c" | 2024-10-11 20:25:51 | null | "> <i>There is so much coal.… | null | null | 41811328 | 41807681 | null | null | null | null | 1 |
41813381 | "comment" | "marcosdumay" | 2024-10-11 20:26:21 | null | "Add forced sedentarism into th… | null | null | 41812891 | 41811263 | null | null | null | null | 1 |
41813383 | "comment" | "davio" | 2024-10-11 20:26:28 | null | "hims sells the generic version… | null | null | 41813102 | 41811263 | null | null | null | null | 1 |
"
- ],
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "d957acf686e74e1cb3060d9f7b3742cc",
+ "version_major": 2,
+ "version_minor": 0
+ },
"text/plain": [
- "shape: (24_134_648, 15)\n",
- "┌──────────┬─────────┬─────────────┬──────────────┬───┬─────────────┬─────────┬──────┬─────────────┐\n",
- "│ id ┆ type ┆ by ┆ time ┆ … ┆ kids ┆ deleted ┆ dead ┆ siblings_co │\n",
- "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ unt │\n",
- "│ i64 ┆ str ┆ str ┆ datetime[μs] ┆ ┆ list[i64] ┆ bool ┆ bool ┆ --- │\n",
- "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ u32 │\n",
- "╞══════════╪═════════╪═════════════╪══════════════╪═══╪═════════════╪═════════╪══════╪═════════════╡\n",
- "│ 194 ┆ comment ┆ jmzachary ┆ 2007-02-20 ┆ … ┆ [205, 422, ┆ null ┆ null ┆ 20 │\n",
- "│ ┆ ┆ ┆ 22:33:51 ┆ ┆ 199] ┆ ┆ ┆ │\n",
- "│ 195 ┆ comment ┆ jdroid ┆ 2007-02-20 ┆ … ┆ [259] ┆ null ┆ null ┆ 20 │\n",
- "│ ┆ ┆ ┆ 22:36:52 ┆ ┆ ┆ ┆ ┆ │\n",
- "│ 199 ┆ comment ┆ Zak ┆ 2007-02-20 ┆ … ┆ [1644, ┆ null ┆ null ┆ 3 │\n",
- "│ ┆ ┆ ┆ 22:48:33 ┆ ┆ 1897] ┆ ┆ ┆ │\n",
- "│ 205 ┆ comment ┆ ninwa ┆ 2007-02-20 ┆ … ┆ [210, 209] ┆ null ┆ null ┆ 3 │\n",
- "│ ┆ ┆ ┆ 23:30:23 ┆ ┆ ┆ ┆ ┆ │\n",
- "│ 209 ┆ comment ┆ ninwa ┆ 2007-02-20 ┆ … ┆ null ┆ null ┆ null ┆ 2 │\n",
- "│ ┆ ┆ ┆ 23:41:34 ┆ ┆ ┆ ┆ ┆ │\n",
- "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n",
- "│ 41813369 ┆ comment ┆ whall6 ┆ 2024-10-11 ┆ … ┆ null ┆ null ┆ null ┆ 1 │\n",
- "│ ┆ ┆ ┆ 20:25:08 ┆ ┆ ┆ ┆ ┆ │\n",
- "│ 41813371 ┆ comment ┆ julianeon ┆ 2024-10-11 ┆ … ┆ null ┆ null ┆ null ┆ 13 │\n",
- "│ ┆ ┆ ┆ 20:25:25 ┆ ┆ ┆ ┆ ┆ │\n",
- "│ 41813375 ┆ comment ┆ throw0101c ┆ 2024-10-11 ┆ … ┆ null ┆ null ┆ null ┆ 1 │\n",
- "│ ┆ ┆ ┆ 20:25:51 ┆ ┆ ┆ ┆ ┆ │\n",
- "│ 41813381 ┆ comment ┆ marcosdumay ┆ 2024-10-11 ┆ … ┆ null ┆ null ┆ null ┆ 1 │\n",
- "│ ┆ ┆ ┆ 20:26:21 ┆ ┆ ┆ ┆ ┆ │\n",
- "│ 41813383 ┆ comment ┆ davio ┆ 2024-10-11 ┆ … ┆ null ┆ null ┆ null ┆ 1 │\n",
- "│ ┆ ┆ ┆ 20:26:28 ┆ ┆ ┆ ┆ ┆ │\n",
- "└──────────┴─────────┴─────────────┴──────────────┴───┴─────────────┴─────────┴──────┴─────────────┘"
+ "Resolving data files: 0%| | 0/39 [00:00, ?it/s]"
]
},
- "execution_count": 4,
"metadata": {},
- "output_type": "execute_result"
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "920de4af16a64027bf4da19f37f2fcac",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Loading dataset shards: 0%| | 0/39 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
}
],
"source": [
- "stories_with_100_points = df.filter(pl.col(\"score\") >= 100)\n",
- "\n",
- "# filter comments_df to only include comments whose top_level_parent is in the stories_with_100_points dataframe\n",
- "comments_df = comments_df.filter(\n",
- " pl.col(\"top_level_parent\").is_in(stories_with_100_points[\"id\"])\n",
- ")\n",
+ "import utils\n",
+ "import polars as pl\n",
"\n",
- "comments_df"
+ "df = utils.full_dataset()"
]
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 2,
"metadata": {},
"outputs": [
{
@@ -156,81 +69,29 @@
]
},
{
- "data": {
- "text/html": [
- "\n",
- "
shape: (24_134_648, 16)id | type | by | time | title | text | url | score | parent | top_level_parent | descendants | kids | deleted | dead | siblings_count | sibling_rank |
---|
i64 | str | str | datetime[μs] | str | str | str | i64 | i64 | i64 | i64 | list[i64] | bool | bool | u32 | i64 |
194 | "comment" | "jmzachary" | 2007-02-20 22:33:51 | null | "Thanks for the rationale. I'm … | null | null | 189 | 189 | null | [205, 422, 199] | null | null | 20 | 13 |
195 | "comment" | "jdroid" | 2007-02-20 22:36:52 | null | "You've filled a hole reddit wa… | null | null | 189 | 189 | null | [259] | null | null | 20 | 3 |
199 | "comment" | "Zak" | 2007-02-20 22:48:33 | null | "I don't think the fact that th… | null | null | 194 | 189 | null | [1644, 1897] | null | null | 3 | 3 |
205 | "comment" | "ninwa" | 2007-02-20 23:30:23 | null | "Really? I was most interested … | null | null | 194 | 189 | null | [210, 209] | null | null | 3 | 1 |
209 | "comment" | "ninwa" | 2007-02-20 23:41:34 | null | "This comment added through the… | null | null | 205 | 189 | null | null | null | null | 2 | 2 |
… | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … |
41813369 | "comment" | "whall6" | 2024-10-11 20:25:08 | null | "If you truly only have 5 minut… | null | null | 41812596 | 41811263 | null | null | null | null | 1 | 1 |
41813371 | "comment" | "julianeon" | 2024-10-11 20:25:25 | null | "But consider the tradeoff: it&… | null | null | 41811539 | 41811263 | null | null | null | null | 13 | 1 |
41813375 | "comment" | "throw0101c" | 2024-10-11 20:25:51 | null | "> <i>There is so much coal.… | null | null | 41811328 | 41807681 | null | null | null | null | 1 | 1 |
41813381 | "comment" | "marcosdumay" | 2024-10-11 20:26:21 | null | "Add forced sedentarism into th… | null | null | 41812891 | 41811263 | null | null | null | null | 1 | 1 |
41813383 | "comment" | "davio" | 2024-10-11 20:26:28 | null | "hims sells the generic version… | null | null | 41813102 | 41811263 | null | null | null | null | 1 | 1 |
"
- ],
- "text/plain": [
- "shape: (24_134_648, 16)\n",
- "┌──────────┬─────────┬─────────────┬──────────────┬───┬─────────┬──────┬─────────────┬─────────────┐\n",
- "│ id ┆ type ┆ by ┆ time ┆ … ┆ deleted ┆ dead ┆ siblings_co ┆ sibling_ran │\n",
- "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ unt ┆ k │\n",
- "│ i64 ┆ str ┆ str ┆ datetime[μs] ┆ ┆ bool ┆ bool ┆ --- ┆ --- │\n",
- "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ u32 ┆ i64 │\n",
- "╞══════════╪═════════╪═════════════╪══════════════╪═══╪═════════╪══════╪═════════════╪═════════════╡\n",
- "│ 194 ┆ comment ┆ jmzachary ┆ 2007-02-20 ┆ … ┆ null ┆ null ┆ 20 ┆ 13 │\n",
- "│ ┆ ┆ ┆ 22:33:51 ┆ ┆ ┆ ┆ ┆ │\n",
- "│ 195 ┆ comment ┆ jdroid ┆ 2007-02-20 ┆ … ┆ null ┆ null ┆ 20 ┆ 3 │\n",
- "│ ┆ ┆ ┆ 22:36:52 ┆ ┆ ┆ ┆ ┆ │\n",
- "│ 199 ┆ comment ┆ Zak ┆ 2007-02-20 ┆ … ┆ null ┆ null ┆ 3 ┆ 3 │\n",
- "│ ┆ ┆ ┆ 22:48:33 ┆ ┆ ┆ ┆ ┆ │\n",
- "│ 205 ┆ comment ┆ ninwa ┆ 2007-02-20 ┆ … ┆ null ┆ null ┆ 3 ┆ 1 │\n",
- "│ ┆ ┆ ┆ 23:30:23 ┆ ┆ ┆ ┆ ┆ │\n",
- "│ 209 ┆ comment ┆ ninwa ┆ 2007-02-20 ┆ … ┆ null ┆ null ┆ 2 ┆ 2 │\n",
- "│ ┆ ┆ ┆ 23:41:34 ┆ ┆ ┆ ┆ ┆ │\n",
- "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n",
- "│ 41813369 ┆ comment ┆ whall6 ┆ 2024-10-11 ┆ … ┆ null ┆ null ┆ 1 ┆ 1 │\n",
- "│ ┆ ┆ ┆ 20:25:08 ┆ ┆ ┆ ┆ ┆ │\n",
- "│ 41813371 ┆ comment ┆ julianeon ┆ 2024-10-11 ┆ … ┆ null ┆ null ┆ 13 ┆ 1 │\n",
- "│ ┆ ┆ ┆ 20:25:25 ┆ ┆ ┆ ┆ ┆ │\n",
- "│ 41813375 ┆ comment ┆ throw0101c ┆ 2024-10-11 ┆ … ┆ null ┆ null ┆ 1 ┆ 1 │\n",
- "│ ┆ ┆ ┆ 20:25:51 ┆ ┆ ┆ ┆ ┆ │\n",
- "│ 41813381 ┆ comment ┆ marcosdumay ┆ 2024-10-11 ┆ … ┆ null ┆ null ┆ 1 ┆ 1 │\n",
- "│ ┆ ┆ ┆ 20:26:21 ┆ ┆ ┆ ┆ ┆ │\n",
- "│ 41813383 ┆ comment ┆ davio ┆ 2024-10-11 ┆ … ┆ null ┆ null ┆ 1 ┆ 1 │\n",
- "│ ┆ ┆ ┆ 20:26:28 ┆ ┆ ┆ ┆ ┆ │\n",
- "└──────────┴─────────┴─────────────┴──────────────┴───┴─────────┴──────┴─────────────┴─────────────┘"
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Calculating nested levels: 100%|██████████| 24/24 [00:14<00:00, 1.71it/s]\n"
+ ]
}
],
"source": [
- "sibling_ranks = dict()\n",
- "\n",
- "# iterate over the rows of the dataframe\n",
- "for groupable_comment in df.select(pl.col(\"kids\")).iter_rows():\n",
- " kids = groupable_comment[0]\n",
- " if kids is not None:\n",
- " for i, kid in enumerate(kids):\n",
- " sibling_ranks[kid] = i + 1\n",
- "\n",
- "# Find the maximum value in the sibling_ranks dictionary\n",
- "max_sibling_rank = max(sibling_ranks.values())\n",
- "\n",
- "# Print the maximum value\n",
- "print(f\"The maximum sibling rank is: {max_sibling_rank}\")\n",
- "print(f\"The number of sibling ranks is: {len(sibling_ranks)}\")\n",
- "\n",
- "comments_df = comments_df.with_columns(\n",
- " [\n",
- " pl.col(\"id\")\n",
- " .replace_strict(\n",
- " list(sibling_ranks.keys()), list(sibling_ranks.values()), default=-1\n",
- " )\n",
- " .alias(\"sibling_rank\")\n",
- " ]\n",
+ "comments_df = utils.augmented_comments()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "stories_with_100_points = df.filter(pl.col(\"score\") >= 100)\n",
+ "\n",
+ "# filter comments_df to only include comments whose top_level_parent is in the stories_with_100_points dataframe\n",
+ "comments_df = comments_df.filter(\n",
+ " pl.col(\"top_level_parent\").is_in(stories_with_100_points[\"id\"])\n",
")\n",
- "del sibling_ranks\n",
"\n",
"comments_df"
]
diff --git a/prepare_env.sh b/prepare_env.sh
index 61b0b31..9b67eb3 100755
--- a/prepare_env.sh
+++ b/prepare_env.sh
@@ -18,7 +18,7 @@ source ~/.bashrc
# Install system dependencies
apt-get update
-apt-get install -y tmux nvtop entr build-essential
+apt-get install -y tmux nvtop htop entr build-essential
# Install python dependencies
curl -LsSf https://astral.sh/uv/0.4.6/install.sh | sh
diff --git a/rate-random-comments.ipynb b/rate-random-comments.ipynb
new file mode 100644
index 0000000..b94ba19
--- /dev/null
+++ b/rate-random-comments.ipynb
@@ -0,0 +1,254 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import polars as pl\n",
+ "from utils import dataset, build_all_prompts, run_inference_sglang\n",
+ "\n",
+ "df = dataset()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Building prompts: 100%|██████████| 100000/100000 [00:58<00:00, 1698.95it/s]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (100_000, 15)id | type | by | time | title | text | url | score | parent | top_level_parent | descendants | kids | deleted | dead | prompt |
---|
i64 | str | str | datetime[μs] | str | str | str | i64 | i64 | i64 | i64 | list[i64] | bool | bool | str |
7209407 | "comment" | "ricardobeat" | 2014-02-10 06:00:25 | null | "The article presents some (non… | null | null | 7207821 | 7207506 | null | [7210958, 7213827] | null | null | "<instructions>Your goal is to … |
13520503 | "comment" | "throwawayish" | 2017-01-30 12:11:14 | null | "Reaaally not that simple." | null | null | 13519742 | 13517389 | null | [13528758] | null | null | "<instructions>Your goal is to … |
13152625 | "comment" | "spikels" | 2016-12-11 19:19:47 | null | "That's an exaggeration. A… | null | null | 13151772 | 13147495 | null | [13165894] | null | null | "<instructions>Your goal is to … |
30957389 | "comment" | "ss108" | 2022-04-08 14:38:05 | null | "Ah, you're probably right… | null | null | 30956591 | 30955290 | null | null | null | null | "<instructions>Your goal is to … |
23335760 | "comment" | "en3r0" | 2020-05-28 10:41:15 | null | "I have been using Trilium late… | null | null | 23335759 | 23335759 | null | [23335860] | null | null | "<instructions>Your goal is to … |
… | … | … | … | … | … | … | … | … | … | … | … | … | … | … |
8110666 | "comment" | "erickookoo" | 2014-07-30 20:17:48 | null | "Part of the reason we built th… | null | null | 8109374 | 8109114 | null | null | null | null | "<instructions>Your goal is to … |
8039481 | "comment" | "josteink" | 2014-07-15 22:22:52 | null | "If you're worried Google … | null | null | 8039322 | 8038990 | null | null | null | null | "<instructions>Your goal is to … |
37965302 | "comment" | "williamdclt" | 2023-10-21 09:21:21 | null | "Do you have an example?" | null | null | 37965257 | 37962370 | null | [37965954, 37965936] | null | null | "<instructions>Your goal is to … |
10112234 | "comment" | "afshin" | 2015-08-24 18:58:41 | null | "Just a guess, but it might be … | null | null | 10111922 | 10108472 | null | [10112273] | null | null | "<instructions>Your goal is to … |
3202168 | "comment" | "Tycho" | 2011-11-06 08:25:19 | null | "I hate how in HN discussions l… | null | null | 3198171 | 3198171 | null | null | null | null | "<instructions>Your goal is to … |
"
+ ],
+ "text/plain": [
+ "shape: (100_000, 15)\n",
+ "┌──────────┬─────────┬──────────────┬─────────────┬───┬─────────────┬─────────┬──────┬─────────────┐\n",
+ "│ id ┆ type ┆ by ┆ time ┆ … ┆ kids ┆ deleted ┆ dead ┆ prompt │\n",
+ "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n",
+ "│ i64 ┆ str ┆ str ┆ datetime[μs ┆ ┆ list[i64] ┆ bool ┆ bool ┆ str │\n",
+ "│ ┆ ┆ ┆ ] ┆ ┆ ┆ ┆ ┆ │\n",
+ "╞══════════╪═════════╪══════════════╪═════════════╪═══╪═════════════╪═════════╪══════╪═════════════╡\n",
+ "│ 7209407 ┆ comment ┆ ricardobeat ┆ 2014-02-10 ┆ … ┆ [7210958, ┆ null ┆ null ┆ Your │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ goal is to │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … │\n",
+ "│ 13520503 ┆ comment ┆ throwawayish ┆ 2017-01-30 ┆ … ┆ [13528758] ┆ null ┆ null ┆ Your │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ goal is to │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … │\n",
+ "│ 13152625 ┆ comment ┆ spikels ┆ 2016-12-11 ┆ … ┆ [13165894] ┆ null ┆ null ┆ Your │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ goal is to │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … │\n",
+ "│ 30957389 ┆ comment ┆ ss108 ┆ 2022-04-08 ┆ … ┆ null ┆ null ┆ null ┆ Your │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ goal is to │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … │\n",
+ "│ 23335760 ┆ comment ┆ en3r0 ┆ 2020-05-28 ┆ … ┆ [23335860] ┆ null ┆ null ┆ Your │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ goal is to │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … │\n",
+ "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n",
+ "│ 8110666 ┆ comment ┆ erickookoo ┆ 2014-07-30 ┆ … ┆ null ┆ null ┆ null ┆ Your │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ goal is to │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … │\n",
+ "│ 8039481 ┆ comment ┆ josteink ┆ 2014-07-15 ┆ … ┆ null ┆ null ┆ null ┆ Your │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ goal is to │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … │\n",
+ "│ 37965302 ┆ comment ┆ williamdclt ┆ 2023-10-21 ┆ … ┆ [37965954, ┆ null ┆ null ┆ Your │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ goal is to │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … │\n",
+ "│ 10112234 ┆ comment ┆ afshin ┆ 2015-08-24 ┆ … ┆ [10112273] ┆ null ┆ null ┆ Your │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ goal is to │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … │\n",
+ "│ 3202168 ┆ comment ┆ Tycho ┆ 2011-11-06 ┆ … ┆ null ┆ null ┆ null ┆ Your │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ goal is to │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … │\n",
+ "└──────────┴─────────┴──────────────┴─────────────┴───┴─────────────┴─────────┴──────┴─────────────┘"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "comments_df = df.filter(pl.col(\"type\") == \"comment\").sample(n=100000, seed=42)\n",
+ "comments_df = comments_df.with_columns(\n",
+ " pl.Series(\"prompt\", build_all_prompts(comments_df[\"id\"])),\n",
+ ")\n",
+ "\n",
+ "df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Running inference: 100%|██████████| 1000/1000 [23:29<00:00, 1.41s/it]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
shape: (100_000, 16)id | type | by | time | title | text | url | score | parent | top_level_parent | descendants | kids | deleted | dead | prompt | reward |
---|
i64 | str | str | datetime[μs] | str | str | str | i64 | i64 | i64 | i64 | list[i64] | bool | bool | str | f64 |
7209407 | "comment" | "ricardobeat" | 2014-02-10 06:00:25 | null | "The article presents some (non… | null | null | 7207821 | 7207506 | null | [7210958, 7213827] | null | null | "<instructions>Your goal is to … | -4.71875 |
13520503 | "comment" | "throwawayish" | 2017-01-30 12:11:14 | null | "Reaaally not that simple." | null | null | 13519742 | 13517389 | null | [13528758] | null | null | "<instructions>Your goal is to … | -15.125 |
13152625 | "comment" | "spikels" | 2016-12-11 19:19:47 | null | "That's an exaggeration. A… | null | null | 13151772 | 13147495 | null | [13165894] | null | null | "<instructions>Your goal is to … | 5.4375 |
30957389 | "comment" | "ss108" | 2022-04-08 14:38:05 | null | "Ah, you're probably right… | null | null | 30956591 | 30955290 | null | null | null | null | "<instructions>Your goal is to … | -21.0 |
23335760 | "comment" | "en3r0" | 2020-05-28 10:41:15 | null | "I have been using Trilium late… | null | null | 23335759 | 23335759 | null | [23335860] | null | null | "<instructions>Your goal is to … | -20.125 |
… | … | … | … | … | … | … | … | … | … | … | … | … | … | … | … |
8110666 | "comment" | "erickookoo" | 2014-07-30 20:17:48 | null | "Part of the reason we built th… | null | null | 8109374 | 8109114 | null | null | null | null | "<instructions>Your goal is to … | 0.484375 |
8039481 | "comment" | "josteink" | 2014-07-15 22:22:52 | null | "If you're worried Google … | null | null | 8039322 | 8038990 | null | null | null | null | "<instructions>Your goal is to … | -12.125 |
37965302 | "comment" | "williamdclt" | 2023-10-21 09:21:21 | null | "Do you have an example?" | null | null | 37965257 | 37962370 | null | [37965954, 37965936] | null | null | "<instructions>Your goal is to … | -22.25 |
10112234 | "comment" | "afshin" | 2015-08-24 18:58:41 | null | "Just a guess, but it might be … | null | null | 10111922 | 10108472 | null | [10112273] | null | null | "<instructions>Your goal is to … | -7.6875 |
3202168 | "comment" | "Tycho" | 2011-11-06 08:25:19 | null | "I hate how in HN discussions l… | null | null | 3198171 | 3198171 | null | null | null | null | "<instructions>Your goal is to … | -4.0 |
"
+ ],
+ "text/plain": [
+ "shape: (100_000, 16)\n",
+ "┌──────────┬─────────┬──────────────┬───────────────┬───┬─────────┬──────┬──────────────┬──────────┐\n",
+ "│ id ┆ type ┆ by ┆ time ┆ … ┆ deleted ┆ dead ┆ prompt ┆ reward │\n",
+ "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n",
+ "│ i64 ┆ str ┆ str ┆ datetime[μs] ┆ ┆ bool ┆ bool ┆ str ┆ f64 │\n",
+ "╞══════════╪═════════╪══════════════╪═══════════════╪═══╪═════════╪══════╪══════════════╪══════════╡\n",
+ "│ 7209407 ┆ comment ┆ ricardobeat ┆ 2014-02-10 ┆ … ┆ null ┆ null ┆ Your goal ┆ │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ is to … ┆ │\n",
+ "│ 13520503 ┆ comment ┆ throwawayish ┆ 2017-01-30 ┆ … ┆ null ┆ null ┆ Your goal ┆ │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ is to … ┆ │\n",
+ "│ 13152625 ┆ comment ┆ spikels ┆ 2016-12-11 ┆ … ┆ null ┆ null ┆ Your goal ┆ │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ is to … ┆ │\n",
+ "│ 30957389 ┆ comment ┆ ss108 ┆ 2022-04-08 ┆ … ┆ null ┆ null ┆ Your goal ┆ │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ is to … ┆ │\n",
+ "│ 23335760 ┆ comment ┆ en3r0 ┆ 2020-05-28 ┆ … ┆ null ┆ null ┆ Your goal ┆ │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ is to … ┆ │\n",
+ "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n",
+ "│ 8110666 ┆ comment ┆ erickookoo ┆ 2014-07-30 ┆ … ┆ null ┆ null ┆ Your goal ┆ │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ is to … ┆ │\n",
+ "│ 8039481 ┆ comment ┆ josteink ┆ 2014-07-15 ┆ … ┆ null ┆ null ┆ Your goal ┆ │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ is to … ┆ │\n",
+ "│ 37965302 ┆ comment ┆ williamdclt ┆ 2023-10-21 ┆ … ┆ null ┆ null ┆ Your goal ┆ │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ is to … ┆ │\n",
+ "│ 10112234 ┆ comment ┆ afshin ┆ 2015-08-24 ┆ … ┆ null ┆ null ┆ Your goal ┆ │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ is to … ┆ │\n",
+ "│ 3202168 ┆ comment ┆ Tycho ┆ 2011-11-06 ┆ … ┆ null ┆ null ┆ Your goal ┆ │\n",
+ "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ is to … ┆ │\n",
+ "└──────────┴─────────┴──────────────┴───────────────┴───┴─────────┴──────┴──────────────┴──────────┘"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "rewards = run_inference_sglang(comments_df[\"prompt\"])\n",
+ "\n",
+ "comments_df = comments_df.with_columns(pl.Series(\"reward\", rewards))\n",
+ "\n",
+ "comments_df.write_parquet(\"./data/random_comments_with_reward.parquet\")\n",
+ "\n",
+ "comments_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import polars as pl\n",
+ "\n",
+ "comments_df = pl.read_parquet(\"./data/random_comments_with_reward.parquet\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import html\n",
+ "import re\n",
+ "from utils import with_story_info\n",
+ "\n",
+ "\n",
+ "def unescape_html(text):\n",
+ " unescaped = html.unescape(text).replace(\"\", \"\\n\\n\")\n",
+ " return re.sub(r']*>[^<]+', r\"\\1\", unescaped)\n",
+ "\n",
+ "\n",
+ "comments_df = with_story_info(comments_df)\n",
+ "\n",
+ "comments_df = comments_df.with_columns(\n",
+ " pl.concat_str(\n",
+ " pl.lit(\"https://news.ycombinator.com/item?id=\"),\n",
+ " pl.col(\"id\"),\n",
+ " ).alias(\"link\"),\n",
+ " pl.col(\"time\").dt.strftime(\"%B %d, %Y\").alias(\"date\"),\n",
+ " pl.col(\"text\").map_elements(unescape_html, return_dtype=pl.String),\n",
+ ")\n",
+ "\n",
+ "comments_df.select(pl.col(\"date\", \"by\", \"link\", \"text\", \"reward\")).sort(\n",
+ " \"reward\", descending=True\n",
+ ").head(100).write_csv(\"./data/top_random_comments_with_links.csv\")\n",
+ "\n",
+ "comments_df.select(pl.col(\"date\", \"by\", \"link\", \"text\", \"reward\")).sort(\n",
+ " \"reward\", descending=False\n",
+ ").head(100).write_csv(\"./data/bottom_random_comments_with_links.csv\")\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/utils.py b/utils.py
index fbf6602..187ce9c 100644
--- a/utils.py
+++ b/utils.py
@@ -1,10 +1,35 @@
+from typing import Union
import polars as pl
from functools import lru_cache
import dicttoxml
+import tqdm
+import requests
+import os
-@lru_cache(maxsize=None)
-def dataset() -> pl.DataFrame:
+def cache_dataframe(path):
+ def decorator(func):
+ cache = {}
+
+ def wrapper(*args, **kwargs):
+ if path in cache:
+ return cache[path]
+ if os.path.exists(path):
+ df = pl.read_parquet(path)
+ else:
+ df = func(*args, **kwargs)
+ print(f"Caching dataframe to {path}")
+ df.write_parquet(path)
+ cache[path] = df
+ return df
+
+ return wrapper
+
+ return decorator
+
+
+@cache_dataframe("./data/full_dataset.parquet")
+def full_dataset() -> pl.DataFrame:
from datasets import load_dataset, Dataset
dataset: Dataset = load_dataset("OpenPipe/hacker-news", split="train")
@@ -12,6 +37,81 @@ def dataset() -> pl.DataFrame:
return dataset.to_polars()
+@cache_dataframe("./data/augmented_comments.parquet")
+def augmented_comments() -> pl.DataFrame:
+ df = full_dataset()
+
+ comments_df = df.filter((pl.col("type") == "comment"))
+
+ comments_df = comments_df.select(
+ pl.col("id", "by", "time", "text", "parent", "top_level_parent", "kids")
+ )
+
+ # add a new column siblings_count
+ comments_df = comments_df.with_columns(
+ [pl.col("id").count().over("parent").alias("siblings_count")]
+ )
+
+ sibling_ranks = dict()
+
+ # iterate over the rows of the dataframe
+ for groupable_comment in df.select(pl.col("kids")).iter_rows():
+ kids = groupable_comment[0]
+ if kids is not None:
+ for i, kid in enumerate(kids):
+ sibling_ranks[kid] = i + 1
+
+ # Find the maximum value in the sibling_ranks dictionary
+ max_sibling_rank = max(sibling_ranks.values())
+
+ # Print the maximum value
+ print(f"The maximum sibling rank is: {max_sibling_rank}")
+ print(f"The number of sibling ranks is: {len(sibling_ranks)}")
+
+ comments_df = comments_df.with_columns(
+ [
+ pl.col("id")
+ .replace_strict(
+ list(sibling_ranks.keys()), list(sibling_ranks.values()), default=-1
+ )
+ .alias("sibling_rank")
+ ]
+ )
+ del sibling_ranks
+
+ comments_df = comments_df.with_columns(pl.lit(-1).alias("nested_level"))
+
+ comments_df = comments_df.with_columns(
+ pl.when(pl.col("top_level_parent") == pl.col("parent"))
+ .then(0)
+ .otherwise(pl.col("nested_level"))
+ .alias("nested_level")
+ )
+
+ for i in tqdm.tqdm(range(1, 25), desc="Calculating nested levels"):
+ parent_ids = comments_df.filter(pl.col("nested_level") == i - 1)["id"]
+
+ comments_df = comments_df.with_columns(
+ pl.when(pl.col("parent").is_in(parent_ids))
+ .then(i)
+ .otherwise(pl.col("nested_level"))
+ .alias("nested_level")
+ )
+
+ return comments_df
+
+
+def build_all_prompts(ids: Union[list[int], pl.Series]) -> list[str]:
+ if isinstance(ids, pl.Series):
+ ids = ids.to_list()
+
+ prompts = []
+ for id in tqdm.tqdm(ids, desc="Building prompts"):
+ prompts.append(build_prompt(id))
+
+ return prompts
+
+
def build_prompt(comment_id: int) -> str:
df = dataset()
comment = df.row(comment_id, named=True)
@@ -42,3 +142,42 @@ def build_prompt(comment_id: int) -> str:
xml: bytes = dicttoxml.dicttoxml(data, attr_type=False, root=False)
return xml.decode("utf-8")
+
+
+def run_inference_sglang(
+ prompts: Union[list[str], pl.Series], chunk_size: int = 100
+) -> list[float]:
+ if isinstance(prompts, pl.Series):
+ prompts = prompts.to_list()
+
+ # Chunk prompts into lists of INFERENCE_CHUNK_SIZE
+ chunks = [prompts[i : i + chunk_size] for i in range(0, len(prompts), chunk_size)]
+
+ rewards = []
+ for chunk in tqdm.tqdm(chunks, desc="Running inference"):
+ json_data = {
+ "conv": chunk,
+ }
+ response = requests.post("http://127.0.0.1:30000/judge", json=json_data).json()
+ rewards.extend([x["embedding"][0] for x in response])
+
+ return rewards
+
+
+def with_story_info(comments_df: pl.DataFrame) -> pl.DataFrame:
+ stories_df = (
+ dataset()
+ .filter(pl.col("type") == "story")
+ .select(pl.col("id", "title", "url"))
+ .rename(
+ {
+ "id": "story_id",
+ "title": "story_title",
+ "url": "story_url",
+ }
+ )
+ )
+
+ return comments_df.join(
+ stories_df, left_on="top_level_parent", right_on="story_id", how="left"
+ )