Skip to content

Commit

Permalink
more helpers
Browse files Browse the repository at this point in the history
  • Loading branch information
corbt committed Oct 17, 2024
1 parent fcf5613 commit fcfa0a0
Show file tree
Hide file tree
Showing 5 changed files with 480 additions and 211 deletions.
47 changes: 31 additions & 16 deletions identify-best-comments.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 1,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -70,7 +70,7 @@
"└──────────┴─────────┴────────────┴────────────┴───┴────────────┴────────────┴───────────┴─────────┘"
]
},
"execution_count": 3,
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -87,7 +87,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -106,23 +106,37 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"sys:1: MapWithoutReturnDtypeWarning: Calling `map_elements` without specifying `return_dtype` can lead to unpredictable results. Specify `return_dtype` to silence this warning.\n"
]
"data": {
"text/plain": [
"\"I'm sorry but no\""
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"html.unescape(\"I&#x27;m sorry but no\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"import html\n",
"import re\n",
"\n",
"\n",
"def unescape_html(text):\n",
" return html.unescape(text)\n",
" unescaped = html.unescape(text).replace(\"<p>\", \"\\n\\n\")\n",
" return re.sub(r'<a href=\"([^\"]+)\"[^>]*>[^<]+</a>', r\"\\1\", unescaped)\n",
"\n",
"\n",
"df = df.with_columns(\n",
Expand All @@ -131,22 +145,23 @@
" pl.col(\"id\"),\n",
" ).alias(\"link\"),\n",
" pl.col(\"time\").dt.strftime(\"%B %d, %Y\").alias(\"date\"),\n",
" pl.col(\"text\").str.replace_all(\"<p>\", \"\\n\\n\").alias(\"text\"),\n",
" pl.col(\"text\")\n",
" .map_elements(unescape_html, return_dtype=pl.String)\n",
" .alias(\"text_unescaped\"),\n",
" pl.col(\"text\").map_elements(unescape_html, return_dtype=pl.String),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"df.select(pl.col(\"date\", \"by\", \"link\", \"story_title\", \"text\", \"reward\")).head(\n",
" 100\n",
").write_csv(\"./data/top_comments_with_links.csv\")\n"
").write_csv(\"./data/top_comments_with_links.csv\")\n",
"\n",
"df.sort(by=\"reward\", descending=False).select(\n",
" pl.col(\"date\", \"by\", \"link\", \"story_title\", \"text\", \"reward\")\n",
").head(100).write_csv(\"./data/bottom_comments_with_links.csv\")"
]
}
],
Expand Down
Loading

0 comments on commit fcfa0a0

Please sign in to comment.