diff --git a/identify-best-comments.ipynb b/identify-best-comments.ipynb index 4b08414..a514d93 100644 --- a/identify-best-comments.ipynb +++ b/identify-best-comments.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -70,7 +70,7 @@ "└──────────┴─────────┴────────────┴────────────┴───┴────────────┴────────────┴───────────┴─────────┘" ] }, - "execution_count": 3, + "execution_count": 1, "metadata": {}, "output_type": "execute_result" } @@ -87,7 +87,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -106,23 +106,37 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 7, "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "sys:1: MapWithoutReturnDtypeWarning: Calling `map_elements` without specifying `return_dtype` can lead to unpredictable results. Specify `return_dtype` to silence this warning.\n" - ] + "data": { + "text/plain": [ + "\"I'm sorry but no\"" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" } ], + "source": [ + "html.unescape(\"I'm sorry but no\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], "source": [ "import html\n", + "import re\n", "\n", "\n", "def unescape_html(text):\n", - " return html.unescape(text)\n", + " unescaped = html.unescape(text).replace(\"

\", \"\\n\\n\")\n", + " return re.sub(r']*>[^<]+', r\"\\1\", unescaped)\n", "\n", "\n", "df = df.with_columns(\n", @@ -131,22 +145,23 @@ " pl.col(\"id\"),\n", " ).alias(\"link\"),\n", " pl.col(\"time\").dt.strftime(\"%B %d, %Y\").alias(\"date\"),\n", - " pl.col(\"text\").str.replace_all(\"

\", \"\\n\\n\").alias(\"text\"),\n", - " pl.col(\"text\")\n", - " .map_elements(unescape_html, return_dtype=pl.String)\n", - " .alias(\"text_unescaped\"),\n", + " pl.col(\"text\").map_elements(unescape_html, return_dtype=pl.String),\n", ")" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "df.select(pl.col(\"date\", \"by\", \"link\", \"story_title\", \"text\", \"reward\")).head(\n", " 100\n", - ").write_csv(\"./data/top_comments_with_links.csv\")\n" + ").write_csv(\"./data/top_comments_with_links.csv\")\n", + "\n", + "df.sort(by=\"reward\", descending=False).select(\n", + " pl.col(\"date\", \"by\", \"link\", \"story_title\", \"text\", \"reward\")\n", + ").head(100).write_csv(\"./data/bottom_comments_with_links.csv\")" ] } ], diff --git a/prepare-dataset.ipynb b/prepare-dataset.ipynb index 8b1bfde..b7c1e39 100644 --- a/prepare-dataset.ipynb +++ b/prepare-dataset.ipynb @@ -2,149 +2,62 @@ "cells": [ { "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import utils\n", - "import polars as pl\n", - "\n", - "df = utils.dataset()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, + "execution_count": 1, "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "

\n", - "shape: (36_323_460, 15)
idtypebytimetitletexturlscoreparenttop_level_parentdescendantskidsdeleteddeadsiblings_count
i64strstrdatetime[μs]strstrstri64i64i64i64list[i64]boolboolu32
15"comment""sama"2006-10-09 19:51:01null"&#34;the rising star of ventur…nullnull11null[17]nullnull4
17"comment""pg"2006-10-09 19:52:45null"Is there anywhere to eat on Sa…nullnull151null[1079]nullnull1
22"comment""pg"2006-10-10 02:18:22null"It's kind of funny that Sevin …nullnull2121nullnullnullnull1
23"comment""starklysnarky"2006-10-10 02:30:53null"This is interesting, but the l…nullnull2020nullnullnullnull1
30"comment""spez"2006-10-10 15:34:59null"Stay tuned..."nullnull2929null[31]nullnull1
41813379"comment""kbolino"2024-10-11 20:26:09null"A non-blocking send would work…nullnull4181275241809262nullnullnullnull1
41813380"comment""slightwinder"2024-10-11 20:26:18null"&gt; It genuinely astounds me …nullnull4181069141808943nullnullnullnull4
41813381"comment""marcosdumay"2024-10-11 20:26:21null"Add forced sedentarism into th…nullnull4181289141811263nullnullnullnull1
41813383"comment""davio"2024-10-11 20:26:28null"hims sells the generic version…nullnull4181310241811263nullnullnullnull1
41813384"comment""btilly"2024-10-11 20:26:29null"Well, if you want a simple arg…nullnull4181213441808127nullnullnullnull1
" - ], + "application/vnd.jupyter.widget-view+json": { + "model_id": "2559539da6124d6d81613429dbf2c720", + "version_major": 2, + "version_minor": 0 + }, "text/plain": [ - "shape: (36_323_460, 15)\n", - "┌──────────┬─────────┬──────────────┬──────────────┬───┬───────────┬─────────┬──────┬──────────────┐\n", - "│ id ┆ type ┆ by ┆ time ┆ … ┆ kids ┆ deleted ┆ dead ┆ siblings_cou │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ nt │\n", - "│ i64 ┆ str ┆ str ┆ datetime[μs] ┆ ┆ list[i64] ┆ bool ┆ bool ┆ --- │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ u32 │\n", - "╞══════════╪═════════╪══════════════╪══════════════╪═══╪═══════════╪═════════╪══════╪══════════════╡\n", - "│ 15 ┆ comment ┆ sama ┆ 2006-10-09 ┆ … ┆ [17] ┆ null ┆ null ┆ 4 │\n", - "│ ┆ ┆ ┆ 19:51:01 ┆ ┆ ┆ ┆ ┆ │\n", - "│ 17 ┆ comment ┆ pg ┆ 2006-10-09 ┆ … ┆ [1079] ┆ null ┆ null ┆ 1 │\n", - "│ ┆ ┆ ┆ 19:52:45 ┆ ┆ ┆ ┆ ┆ │\n", - "│ 22 ┆ comment ┆ pg ┆ 2006-10-10 ┆ … ┆ null ┆ null ┆ null ┆ 1 │\n", - "│ ┆ ┆ ┆ 02:18:22 ┆ ┆ ┆ ┆ ┆ │\n", - "│ 23 ┆ comment ┆ starklysnark ┆ 2006-10-10 ┆ … ┆ null ┆ null ┆ null ┆ 1 │\n", - "│ ┆ ┆ y ┆ 02:30:53 ┆ ┆ ┆ ┆ ┆ │\n", - "│ 30 ┆ comment ┆ spez ┆ 2006-10-10 ┆ … ┆ [31] ┆ null ┆ null ┆ 1 │\n", - "│ ┆ ┆ ┆ 15:34:59 ┆ ┆ ┆ ┆ ┆ │\n", - "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", - "│ 41813379 ┆ comment ┆ kbolino ┆ 2024-10-11 ┆ … ┆ null ┆ null ┆ null ┆ 1 │\n", - "│ ┆ ┆ ┆ 20:26:09 ┆ ┆ ┆ ┆ ┆ │\n", - "│ 41813380 ┆ comment ┆ slightwinder ┆ 2024-10-11 ┆ … ┆ null ┆ null ┆ null ┆ 4 │\n", - "│ ┆ ┆ ┆ 20:26:18 ┆ ┆ ┆ ┆ ┆ │\n", - "│ 41813381 ┆ comment ┆ marcosdumay ┆ 2024-10-11 ┆ … ┆ null ┆ null ┆ null ┆ 1 │\n", - "│ ┆ ┆ ┆ 20:26:21 ┆ ┆ ┆ ┆ ┆ │\n", - "│ 41813383 ┆ comment ┆ davio ┆ 2024-10-11 ┆ … ┆ null ┆ null ┆ null ┆ 1 │\n", - "│ ┆ ┆ ┆ 20:26:28 ┆ ┆ ┆ ┆ ┆ │\n", - "│ 41813384 ┆ comment ┆ btilly ┆ 2024-10-11 ┆ … ┆ null ┆ null ┆ null ┆ 1 │\n", - "│ ┆ ┆ ┆ 20:26:29 ┆ ┆ ┆ ┆ ┆ │\n", - "└──────────┴─────────┴──────────────┴──────────────┴───┴───────────┴─────────┴──────┴──────────────┘" + "Resolving data files: 0%| | 0/39 [00:00\n", - "shape: (24_134_648, 15)
idtypebytimetitletexturlscoreparenttop_level_parentdescendantskidsdeleteddeadsiblings_count
i64strstrdatetime[μs]strstrstri64i64i64i64list[i64]boolboolu32
194"comment""jmzachary"2007-02-20 22:33:51null"Thanks for the rationale. I'm …nullnull189189null[205, 422, 199]nullnull20
195"comment""jdroid"2007-02-20 22:36:52null"You've filled a hole reddit wa…nullnull189189null[259]nullnull20
199"comment""Zak"2007-02-20 22:48:33null"I don't think the fact that th…nullnull194189null[1644, 1897]nullnull3
205"comment""ninwa"2007-02-20 23:30:23null"Really? I was most interested …nullnull194189null[210, 209]nullnull3
209"comment""ninwa"2007-02-20 23:41:34null"This comment added through the…nullnull205189nullnullnullnull2
41813369"comment""whall6"2024-10-11 20:25:08null"If you truly only have 5 minut…nullnull4181259641811263nullnullnullnull1
41813371"comment""julianeon"2024-10-11 20:25:25null"But consider the tradeoff: it&…nullnull4181153941811263nullnullnullnull13
41813375"comment""throw0101c"2024-10-11 20:25:51null"&gt; <i>There is so much coal.…nullnull4181132841807681nullnullnullnull1
41813381"comment""marcosdumay"2024-10-11 20:26:21null"Add forced sedentarism into th…nullnull4181289141811263nullnullnullnull1
41813383"comment""davio"2024-10-11 20:26:28null"hims sells the generic version…nullnull4181310241811263nullnullnullnull1
" - ], + "application/vnd.jupyter.widget-view+json": { + "model_id": "d957acf686e74e1cb3060d9f7b3742cc", + "version_major": 2, + "version_minor": 0 + }, "text/plain": [ - "shape: (24_134_648, 15)\n", - "┌──────────┬─────────┬─────────────┬──────────────┬───┬─────────────┬─────────┬──────┬─────────────┐\n", - "│ id ┆ type ┆ by ┆ time ┆ … ┆ kids ┆ deleted ┆ dead ┆ siblings_co │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ unt │\n", - "│ i64 ┆ str ┆ str ┆ datetime[μs] ┆ ┆ list[i64] ┆ bool ┆ bool ┆ --- │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ u32 │\n", - "╞══════════╪═════════╪═════════════╪══════════════╪═══╪═════════════╪═════════╪══════╪═════════════╡\n", - "│ 194 ┆ comment ┆ jmzachary ┆ 2007-02-20 ┆ … ┆ [205, 422, ┆ null ┆ null ┆ 20 │\n", - "│ ┆ ┆ ┆ 22:33:51 ┆ ┆ 199] ┆ ┆ ┆ │\n", - "│ 195 ┆ comment ┆ jdroid ┆ 2007-02-20 ┆ … ┆ [259] ┆ null ┆ null ┆ 20 │\n", - "│ ┆ ┆ ┆ 22:36:52 ┆ ┆ ┆ ┆ ┆ │\n", - "│ 199 ┆ comment ┆ Zak ┆ 2007-02-20 ┆ … ┆ [1644, ┆ null ┆ null ┆ 3 │\n", - "│ ┆ ┆ ┆ 22:48:33 ┆ ┆ 1897] ┆ ┆ ┆ │\n", - "│ 205 ┆ comment ┆ ninwa ┆ 2007-02-20 ┆ … ┆ [210, 209] ┆ null ┆ null ┆ 3 │\n", - "│ ┆ ┆ ┆ 23:30:23 ┆ ┆ ┆ ┆ ┆ │\n", - "│ 209 ┆ comment ┆ ninwa ┆ 2007-02-20 ┆ … ┆ null ┆ null ┆ null ┆ 2 │\n", - "│ ┆ ┆ ┆ 23:41:34 ┆ ┆ ┆ ┆ ┆ │\n", - "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", - "│ 41813369 ┆ comment ┆ whall6 ┆ 2024-10-11 ┆ … ┆ null ┆ null ┆ null ┆ 1 │\n", - "│ ┆ ┆ ┆ 20:25:08 ┆ ┆ ┆ ┆ ┆ │\n", - "│ 41813371 ┆ comment ┆ julianeon ┆ 2024-10-11 ┆ … ┆ null ┆ null ┆ null ┆ 13 │\n", - "│ ┆ ┆ ┆ 20:25:25 ┆ ┆ ┆ ┆ ┆ │\n", - "│ 41813375 ┆ comment ┆ throw0101c ┆ 2024-10-11 ┆ … ┆ null ┆ null ┆ null ┆ 1 │\n", - "│ ┆ ┆ ┆ 20:25:51 ┆ ┆ ┆ ┆ ┆ │\n", - "│ 41813381 ┆ comment ┆ marcosdumay ┆ 2024-10-11 ┆ … ┆ null ┆ null ┆ null ┆ 1 │\n", - "│ ┆ ┆ ┆ 20:26:21 ┆ ┆ ┆ ┆ ┆ │\n", - "│ 41813383 ┆ comment ┆ davio ┆ 2024-10-11 ┆ … ┆ null ┆ null ┆ null ┆ 1 │\n", - "│ ┆ ┆ ┆ 20:26:28 ┆ ┆ ┆ ┆ ┆ │\n", - "└──────────┴─────────┴─────────────┴──────────────┴───┴─────────────┴─────────┴──────┴─────────────┘" + "Resolving data files: 0%| | 0/39 [00:00= 100)\n", - "\n", - "# filter comments_df to only include comments whose top_level_parent is in the stories_with_100_points dataframe\n", - "comments_df = comments_df.filter(\n", - " pl.col(\"top_level_parent\").is_in(stories_with_100_points[\"id\"])\n", - ")\n", + "import utils\n", + "import polars as pl\n", "\n", - "comments_df" + "df = utils.full_dataset()" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -156,81 +69,29 @@ ] }, { - "data": { - "text/html": [ - "
\n", - "shape: (24_134_648, 16)
idtypebytimetitletexturlscoreparenttop_level_parentdescendantskidsdeleteddeadsiblings_countsibling_rank
i64strstrdatetime[μs]strstrstri64i64i64i64list[i64]boolboolu32i64
194"comment""jmzachary"2007-02-20 22:33:51null"Thanks for the rationale. I'm …nullnull189189null[205, 422, 199]nullnull2013
195"comment""jdroid"2007-02-20 22:36:52null"You've filled a hole reddit wa…nullnull189189null[259]nullnull203
199"comment""Zak"2007-02-20 22:48:33null"I don't think the fact that th…nullnull194189null[1644, 1897]nullnull33
205"comment""ninwa"2007-02-20 23:30:23null"Really? I was most interested …nullnull194189null[210, 209]nullnull31
209"comment""ninwa"2007-02-20 23:41:34null"This comment added through the…nullnull205189nullnullnullnull22
41813369"comment""whall6"2024-10-11 20:25:08null"If you truly only have 5 minut…nullnull4181259641811263nullnullnullnull11
41813371"comment""julianeon"2024-10-11 20:25:25null"But consider the tradeoff: it&…nullnull4181153941811263nullnullnullnull131
41813375"comment""throw0101c"2024-10-11 20:25:51null"&gt; <i>There is so much coal.…nullnull4181132841807681nullnullnullnull11
41813381"comment""marcosdumay"2024-10-11 20:26:21null"Add forced sedentarism into th…nullnull4181289141811263nullnullnullnull11
41813383"comment""davio"2024-10-11 20:26:28null"hims sells the generic version…nullnull4181310241811263nullnullnullnull11
" - ], - "text/plain": [ - "shape: (24_134_648, 16)\n", - "┌──────────┬─────────┬─────────────┬──────────────┬───┬─────────┬──────┬─────────────┬─────────────┐\n", - "│ id ┆ type ┆ by ┆ time ┆ … ┆ deleted ┆ dead ┆ siblings_co ┆ sibling_ran │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ unt ┆ k │\n", - "│ i64 ┆ str ┆ str ┆ datetime[μs] ┆ ┆ bool ┆ bool ┆ --- ┆ --- │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ u32 ┆ i64 │\n", - "╞══════════╪═════════╪═════════════╪══════════════╪═══╪═════════╪══════╪═════════════╪═════════════╡\n", - "│ 194 ┆ comment ┆ jmzachary ┆ 2007-02-20 ┆ … ┆ null ┆ null ┆ 20 ┆ 13 │\n", - "│ ┆ ┆ ┆ 22:33:51 ┆ ┆ ┆ ┆ ┆ │\n", - "│ 195 ┆ comment ┆ jdroid ┆ 2007-02-20 ┆ … ┆ null ┆ null ┆ 20 ┆ 3 │\n", - "│ ┆ ┆ ┆ 22:36:52 ┆ ┆ ┆ ┆ ┆ │\n", - "│ 199 ┆ comment ┆ Zak ┆ 2007-02-20 ┆ … ┆ null ┆ null ┆ 3 ┆ 3 │\n", - "│ ┆ ┆ ┆ 22:48:33 ┆ ┆ ┆ ┆ ┆ │\n", - "│ 205 ┆ comment ┆ ninwa ┆ 2007-02-20 ┆ … ┆ null ┆ null ┆ 3 ┆ 1 │\n", - "│ ┆ ┆ ┆ 23:30:23 ┆ ┆ ┆ ┆ ┆ │\n", - "│ 209 ┆ comment ┆ ninwa ┆ 2007-02-20 ┆ … ┆ null ┆ null ┆ 2 ┆ 2 │\n", - "│ ┆ ┆ ┆ 23:41:34 ┆ ┆ ┆ ┆ ┆ │\n", - "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", - "│ 41813369 ┆ comment ┆ whall6 ┆ 2024-10-11 ┆ … ┆ null ┆ null ┆ 1 ┆ 1 │\n", - "│ ┆ ┆ ┆ 20:25:08 ┆ ┆ ┆ ┆ ┆ │\n", - "│ 41813371 ┆ comment ┆ julianeon ┆ 2024-10-11 ┆ … ┆ null ┆ null ┆ 13 ┆ 1 │\n", - "│ ┆ ┆ ┆ 20:25:25 ┆ ┆ ┆ ┆ ┆ │\n", - "│ 41813375 ┆ comment ┆ throw0101c ┆ 2024-10-11 ┆ … ┆ null ┆ null ┆ 1 ┆ 1 │\n", - "│ ┆ ┆ ┆ 20:25:51 ┆ ┆ ┆ ┆ ┆ │\n", - "│ 41813381 ┆ comment ┆ marcosdumay ┆ 2024-10-11 ┆ … ┆ null ┆ null ┆ 1 ┆ 1 │\n", - "│ ┆ ┆ ┆ 20:26:21 ┆ ┆ ┆ ┆ ┆ │\n", - "│ 41813383 ┆ comment ┆ davio ┆ 2024-10-11 ┆ … ┆ null ┆ null ┆ 1 ┆ 1 │\n", - "│ ┆ ┆ ┆ 20:26:28 ┆ ┆ ┆ ┆ ┆ │\n", - "└──────────┴─────────┴─────────────┴──────────────┴───┴─────────┴──────┴─────────────┴─────────────┘" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" + "name": "stderr", + "output_type": "stream", + "text": [ + "Calculating nested levels: 100%|██████████| 24/24 [00:14<00:00, 1.71it/s]\n" + ] } ], "source": [ - "sibling_ranks = dict()\n", - "\n", - "# iterate over the rows of the dataframe\n", - "for groupable_comment in df.select(pl.col(\"kids\")).iter_rows():\n", - " kids = groupable_comment[0]\n", - " if kids is not None:\n", - " for i, kid in enumerate(kids):\n", - " sibling_ranks[kid] = i + 1\n", - "\n", - "# Find the maximum value in the sibling_ranks dictionary\n", - "max_sibling_rank = max(sibling_ranks.values())\n", - "\n", - "# Print the maximum value\n", - "print(f\"The maximum sibling rank is: {max_sibling_rank}\")\n", - "print(f\"The number of sibling ranks is: {len(sibling_ranks)}\")\n", - "\n", - "comments_df = comments_df.with_columns(\n", - " [\n", - " pl.col(\"id\")\n", - " .replace_strict(\n", - " list(sibling_ranks.keys()), list(sibling_ranks.values()), default=-1\n", - " )\n", - " .alias(\"sibling_rank\")\n", - " ]\n", + "comments_df = utils.augmented_comments()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stories_with_100_points = df.filter(pl.col(\"score\") >= 100)\n", + "\n", + "# filter comments_df to only include comments whose top_level_parent is in the stories_with_100_points dataframe\n", + "comments_df = comments_df.filter(\n", + " pl.col(\"top_level_parent\").is_in(stories_with_100_points[\"id\"])\n", ")\n", - "del sibling_ranks\n", "\n", "comments_df" ] diff --git a/prepare_env.sh b/prepare_env.sh index 61b0b31..9b67eb3 100755 --- a/prepare_env.sh +++ b/prepare_env.sh @@ -18,7 +18,7 @@ source ~/.bashrc # Install system dependencies apt-get update -apt-get install -y tmux nvtop entr build-essential +apt-get install -y tmux nvtop htop entr build-essential # Install python dependencies curl -LsSf https://astral.sh/uv/0.4.6/install.sh | sh diff --git a/rate-random-comments.ipynb b/rate-random-comments.ipynb new file mode 100644 index 0000000..b94ba19 --- /dev/null +++ b/rate-random-comments.ipynb @@ -0,0 +1,254 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "from utils import dataset, build_all_prompts, run_inference_sglang\n", + "\n", + "df = dataset()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Building prompts: 100%|██████████| 100000/100000 [00:58<00:00, 1698.95it/s]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "shape: (100_000, 15)
idtypebytimetitletexturlscoreparenttop_level_parentdescendantskidsdeleteddeadprompt
i64strstrdatetime[μs]strstrstri64i64i64i64list[i64]boolboolstr
7209407"comment""ricardobeat"2014-02-10 06:00:25null"The article presents some (non…nullnull72078217207506null[7210958, 7213827]nullnull"<instructions>Your goal is to …
13520503"comment""throwawayish"2017-01-30 12:11:14null"Reaaally not that simple."nullnull1351974213517389null[13528758]nullnull"<instructions>Your goal is to …
13152625"comment""spikels"2016-12-11 19:19:47null"That&#x27;s an exaggeration. A…nullnull1315177213147495null[13165894]nullnull"<instructions>Your goal is to …
30957389"comment""ss108"2022-04-08 14:38:05null"Ah, you&#x27;re probably right…nullnull3095659130955290nullnullnullnull"<instructions>Your goal is to …
23335760"comment""en3r0"2020-05-28 10:41:15null"I have been using Trilium late…nullnull2333575923335759null[23335860]nullnull"<instructions>Your goal is to …
8110666"comment""erickookoo"2014-07-30 20:17:48null"Part of the reason we built th…nullnull81093748109114nullnullnullnull"<instructions>Your goal is to …
8039481"comment""josteink"2014-07-15 22:22:52null"If you&#x27;re worried Google …nullnull80393228038990nullnullnullnull"<instructions>Your goal is to …
37965302"comment""williamdclt"2023-10-21 09:21:21null"Do you have an example?"nullnull3796525737962370null[37965954, 37965936]nullnull"<instructions>Your goal is to …
10112234"comment""afshin"2015-08-24 18:58:41null"Just a guess, but it might be …nullnull1011192210108472null[10112273]nullnull"<instructions>Your goal is to …
3202168"comment""Tycho"2011-11-06 08:25:19null"I hate how in HN discussions l…nullnull31981713198171nullnullnullnull"<instructions>Your goal is to …
" + ], + "text/plain": [ + "shape: (100_000, 15)\n", + "┌──────────┬─────────┬──────────────┬─────────────┬───┬─────────────┬─────────┬──────┬─────────────┐\n", + "│ id ┆ type ┆ by ┆ time ┆ … ┆ kids ┆ deleted ┆ dead ┆ prompt │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ str ┆ str ┆ datetime[μs ┆ ┆ list[i64] ┆ bool ┆ bool ┆ str │\n", + "│ ┆ ┆ ┆ ] ┆ ┆ ┆ ┆ ┆ │\n", + "╞══════════╪═════════╪══════════════╪═════════════╪═══╪═════════════╪═════════╪══════╪═════════════╡\n", + "│ 7209407 ┆ comment ┆ ricardobeat ┆ 2014-02-10 ┆ … ┆ [7210958, ┆ null ┆ null ┆ Your │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ goal is to │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … │\n", + "│ 13520503 ┆ comment ┆ throwawayish ┆ 2017-01-30 ┆ … ┆ [13528758] ┆ null ┆ null ┆ Your │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ goal is to │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … │\n", + "│ 13152625 ┆ comment ┆ spikels ┆ 2016-12-11 ┆ … ┆ [13165894] ┆ null ┆ null ┆ Your │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ goal is to │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … │\n", + "│ 30957389 ┆ comment ┆ ss108 ┆ 2022-04-08 ┆ … ┆ null ┆ null ┆ null ┆ Your │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ goal is to │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … │\n", + "│ 23335760 ┆ comment ┆ en3r0 ┆ 2020-05-28 ┆ … ┆ [23335860] ┆ null ┆ null ┆ Your │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ goal is to │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … │\n", + "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", + "│ 8110666 ┆ comment ┆ erickookoo ┆ 2014-07-30 ┆ … ┆ null ┆ null ┆ null ┆ Your │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ goal is to │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … │\n", + "│ 8039481 ┆ comment ┆ josteink ┆ 2014-07-15 ┆ … ┆ null ┆ null ┆ null ┆ Your │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ goal is to │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … │\n", + "│ 37965302 ┆ comment ┆ williamdclt ┆ 2023-10-21 ┆ … ┆ [37965954, ┆ null ┆ null ┆ Your │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ goal is to │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … │\n", + "│ 10112234 ┆ comment ┆ afshin ┆ 2015-08-24 ┆ … ┆ [10112273] ┆ null ┆ null ┆ Your │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ goal is to │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … │\n", + "│ 3202168 ┆ comment ┆ Tycho ┆ 2011-11-06 ┆ … ┆ null ┆ null ┆ null ┆ Your │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ goal is to │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ … │\n", + "└──────────┴─────────┴──────────────┴─────────────┴───┴─────────────┴─────────┴──────┴─────────────┘" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "comments_df = df.filter(pl.col(\"type\") == \"comment\").sample(n=100000, seed=42)\n", + "comments_df = comments_df.with_columns(\n", + " pl.Series(\"prompt\", build_all_prompts(comments_df[\"id\"])),\n", + ")\n", + "\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Running inference: 100%|██████████| 1000/1000 [23:29<00:00, 1.41s/it]\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "shape: (100_000, 16)
idtypebytimetitletexturlscoreparenttop_level_parentdescendantskidsdeleteddeadpromptreward
i64strstrdatetime[μs]strstrstri64i64i64i64list[i64]boolboolstrf64
7209407"comment""ricardobeat"2014-02-10 06:00:25null"The article presents some (non…nullnull72078217207506null[7210958, 7213827]nullnull"<instructions>Your goal is to …-4.71875
13520503"comment""throwawayish"2017-01-30 12:11:14null"Reaaally not that simple."nullnull1351974213517389null[13528758]nullnull"<instructions>Your goal is to …-15.125
13152625"comment""spikels"2016-12-11 19:19:47null"That&#x27;s an exaggeration. A…nullnull1315177213147495null[13165894]nullnull"<instructions>Your goal is to …5.4375
30957389"comment""ss108"2022-04-08 14:38:05null"Ah, you&#x27;re probably right…nullnull3095659130955290nullnullnullnull"<instructions>Your goal is to …-21.0
23335760"comment""en3r0"2020-05-28 10:41:15null"I have been using Trilium late…nullnull2333575923335759null[23335860]nullnull"<instructions>Your goal is to …-20.125
8110666"comment""erickookoo"2014-07-30 20:17:48null"Part of the reason we built th…nullnull81093748109114nullnullnullnull"<instructions>Your goal is to …0.484375
8039481"comment""josteink"2014-07-15 22:22:52null"If you&#x27;re worried Google …nullnull80393228038990nullnullnullnull"<instructions>Your goal is to …-12.125
37965302"comment""williamdclt"2023-10-21 09:21:21null"Do you have an example?"nullnull3796525737962370null[37965954, 37965936]nullnull"<instructions>Your goal is to …-22.25
10112234"comment""afshin"2015-08-24 18:58:41null"Just a guess, but it might be …nullnull1011192210108472null[10112273]nullnull"<instructions>Your goal is to …-7.6875
3202168"comment""Tycho"2011-11-06 08:25:19null"I hate how in HN discussions l…nullnull31981713198171nullnullnullnull"<instructions>Your goal is to …-4.0
" + ], + "text/plain": [ + "shape: (100_000, 16)\n", + "┌──────────┬─────────┬──────────────┬───────────────┬───┬─────────┬──────┬──────────────┬──────────┐\n", + "│ id ┆ type ┆ by ┆ time ┆ … ┆ deleted ┆ dead ┆ prompt ┆ reward │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ i64 ┆ str ┆ str ┆ datetime[μs] ┆ ┆ bool ┆ bool ┆ str ┆ f64 │\n", + "╞══════════╪═════════╪══════════════╪═══════════════╪═══╪═════════╪══════╪══════════════╪══════════╡\n", + "│ 7209407 ┆ comment ┆ ricardobeat ┆ 2014-02-10 ┆ … ┆ null ┆ null ┆ Your goal ┆ │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ is to … ┆ │\n", + "│ 13520503 ┆ comment ┆ throwawayish ┆ 2017-01-30 ┆ … ┆ null ┆ null ┆ Your goal ┆ │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ is to … ┆ │\n", + "│ 13152625 ┆ comment ┆ spikels ┆ 2016-12-11 ┆ … ┆ null ┆ null ┆ Your goal ┆ │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ is to … ┆ │\n", + "│ 30957389 ┆ comment ┆ ss108 ┆ 2022-04-08 ┆ … ┆ null ┆ null ┆ Your goal ┆ │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ is to … ┆ │\n", + "│ 23335760 ┆ comment ┆ en3r0 ┆ 2020-05-28 ┆ … ┆ null ┆ null ┆ Your goal ┆ │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ is to … ┆ │\n", + "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", + "│ 8110666 ┆ comment ┆ erickookoo ┆ 2014-07-30 ┆ … ┆ null ┆ null ┆ Your goal ┆ │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ is to … ┆ │\n", + "│ 8039481 ┆ comment ┆ josteink ┆ 2014-07-15 ┆ … ┆ null ┆ null ┆ Your goal ┆ │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ is to … ┆ │\n", + "│ 37965302 ┆ comment ┆ williamdclt ┆ 2023-10-21 ┆ … ┆ null ┆ null ┆ Your goal ┆ │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ is to … ┆ │\n", + "│ 10112234 ┆ comment ┆ afshin ┆ 2015-08-24 ┆ … ┆ null ┆ null ┆ Your goal ┆ │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ is to … ┆ │\n", + "│ 3202168 ┆ comment ┆ Tycho ┆ 2011-11-06 ┆ … ┆ null ┆ null ┆ Your goal ┆ │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ is to … ┆ │\n", + "└──────────┴─────────┴──────────────┴───────────────┴───┴─────────┴──────┴──────────────┴──────────┘" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rewards = run_inference_sglang(comments_df[\"prompt\"])\n", + "\n", + "comments_df = comments_df.with_columns(pl.Series(\"reward\", rewards))\n", + "\n", + "comments_df.write_parquet(\"./data/random_comments_with_reward.parquet\")\n", + "\n", + "comments_df" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "\n", + "comments_df = pl.read_parquet(\"./data/random_comments_with_reward.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "import html\n", + "import re\n", + "from utils import with_story_info\n", + "\n", + "\n", + "def unescape_html(text):\n", + " unescaped = html.unescape(text).replace(\"

\", \"\\n\\n\")\n", + " return re.sub(r']*>[^<]+', r\"\\1\", unescaped)\n", + "\n", + "\n", + "comments_df = with_story_info(comments_df)\n", + "\n", + "comments_df = comments_df.with_columns(\n", + " pl.concat_str(\n", + " pl.lit(\"https://news.ycombinator.com/item?id=\"),\n", + " pl.col(\"id\"),\n", + " ).alias(\"link\"),\n", + " pl.col(\"time\").dt.strftime(\"%B %d, %Y\").alias(\"date\"),\n", + " pl.col(\"text\").map_elements(unescape_html, return_dtype=pl.String),\n", + ")\n", + "\n", + "comments_df.select(pl.col(\"date\", \"by\", \"link\", \"text\", \"reward\")).sort(\n", + " \"reward\", descending=True\n", + ").head(100).write_csv(\"./data/top_random_comments_with_links.csv\")\n", + "\n", + "comments_df.select(pl.col(\"date\", \"by\", \"link\", \"text\", \"reward\")).sort(\n", + " \"reward\", descending=False\n", + ").head(100).write_csv(\"./data/bottom_random_comments_with_links.csv\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/utils.py b/utils.py index fbf6602..187ce9c 100644 --- a/utils.py +++ b/utils.py @@ -1,10 +1,35 @@ +from typing import Union import polars as pl from functools import lru_cache import dicttoxml +import tqdm +import requests +import os -@lru_cache(maxsize=None) -def dataset() -> pl.DataFrame: +def cache_dataframe(path): + def decorator(func): + cache = {} + + def wrapper(*args, **kwargs): + if path in cache: + return cache[path] + if os.path.exists(path): + df = pl.read_parquet(path) + else: + df = func(*args, **kwargs) + print(f"Caching dataframe to {path}") + df.write_parquet(path) + cache[path] = df + return df + + return wrapper + + return decorator + + +@cache_dataframe("./data/full_dataset.parquet") +def full_dataset() -> pl.DataFrame: from datasets import load_dataset, Dataset dataset: Dataset = load_dataset("OpenPipe/hacker-news", split="train") @@ -12,6 +37,81 @@ def dataset() -> pl.DataFrame: return dataset.to_polars() +@cache_dataframe("./data/augmented_comments.parquet") +def augmented_comments() -> pl.DataFrame: + df = full_dataset() + + comments_df = df.filter((pl.col("type") == "comment")) + + comments_df = comments_df.select( + pl.col("id", "by", "time", "text", "parent", "top_level_parent", "kids") + ) + + # add a new column siblings_count + comments_df = comments_df.with_columns( + [pl.col("id").count().over("parent").alias("siblings_count")] + ) + + sibling_ranks = dict() + + # iterate over the rows of the dataframe + for groupable_comment in df.select(pl.col("kids")).iter_rows(): + kids = groupable_comment[0] + if kids is not None: + for i, kid in enumerate(kids): + sibling_ranks[kid] = i + 1 + + # Find the maximum value in the sibling_ranks dictionary + max_sibling_rank = max(sibling_ranks.values()) + + # Print the maximum value + print(f"The maximum sibling rank is: {max_sibling_rank}") + print(f"The number of sibling ranks is: {len(sibling_ranks)}") + + comments_df = comments_df.with_columns( + [ + pl.col("id") + .replace_strict( + list(sibling_ranks.keys()), list(sibling_ranks.values()), default=-1 + ) + .alias("sibling_rank") + ] + ) + del sibling_ranks + + comments_df = comments_df.with_columns(pl.lit(-1).alias("nested_level")) + + comments_df = comments_df.with_columns( + pl.when(pl.col("top_level_parent") == pl.col("parent")) + .then(0) + .otherwise(pl.col("nested_level")) + .alias("nested_level") + ) + + for i in tqdm.tqdm(range(1, 25), desc="Calculating nested levels"): + parent_ids = comments_df.filter(pl.col("nested_level") == i - 1)["id"] + + comments_df = comments_df.with_columns( + pl.when(pl.col("parent").is_in(parent_ids)) + .then(i) + .otherwise(pl.col("nested_level")) + .alias("nested_level") + ) + + return comments_df + + +def build_all_prompts(ids: Union[list[int], pl.Series]) -> list[str]: + if isinstance(ids, pl.Series): + ids = ids.to_list() + + prompts = [] + for id in tqdm.tqdm(ids, desc="Building prompts"): + prompts.append(build_prompt(id)) + + return prompts + + def build_prompt(comment_id: int) -> str: df = dataset() comment = df.row(comment_id, named=True) @@ -42,3 +142,42 @@ def build_prompt(comment_id: int) -> str: xml: bytes = dicttoxml.dicttoxml(data, attr_type=False, root=False) return xml.decode("utf-8") + + +def run_inference_sglang( + prompts: Union[list[str], pl.Series], chunk_size: int = 100 +) -> list[float]: + if isinstance(prompts, pl.Series): + prompts = prompts.to_list() + + # Chunk prompts into lists of INFERENCE_CHUNK_SIZE + chunks = [prompts[i : i + chunk_size] for i in range(0, len(prompts), chunk_size)] + + rewards = [] + for chunk in tqdm.tqdm(chunks, desc="Running inference"): + json_data = { + "conv": chunk, + } + response = requests.post("http://127.0.0.1:30000/judge", json=json_data).json() + rewards.extend([x["embedding"][0] for x in response]) + + return rewards + + +def with_story_info(comments_df: pl.DataFrame) -> pl.DataFrame: + stories_df = ( + dataset() + .filter(pl.col("type") == "story") + .select(pl.col("id", "title", "url")) + .rename( + { + "id": "story_id", + "title": "story_title", + "url": "story_url", + } + ) + ) + + return comments_df.join( + stories_df, left_on="top_level_parent", right_on="story_id", how="left" + )