From 7755d874fe93225886fbd7ec0d20d11260262d46 Mon Sep 17 00:00:00 2001 From: Sarah-Alabbadi7 Date: Wed, 27 Aug 2025 16:07:45 +0400 Subject: [PATCH] News Search Engine task.ipynb --- .../News Search Engine task.ipynb | 676 ++++++++++++++++++ 1 file changed, 676 insertions(+) create mode 100644 Team_members/from_sarah_ali/News Search Engine task.ipynb diff --git a/Team_members/from_sarah_ali/News Search Engine task.ipynb b/Team_members/from_sarah_ali/News Search Engine task.ipynb new file mode 100644 index 000000000..b1948a825 --- /dev/null +++ b/Team_members/from_sarah_ali/News Search Engine task.ipynb @@ -0,0 +1,676 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "7da84d8f-d2e0-491c-9f80-2b818b31c154", + "metadata": {}, + "source": [ + "Tasks\n", + "1. Data Preprocessing\n", + "Load dataset and filter categories.\n", + "Balance dataset (1000 per category).\n", + "Keep only headline and category.\n", + "2. Vectorization\n", + "Train a TF-IDF Vectorizer on the 4000 headlines.\n", + "Store vectors for all articles.\n", + "\n", + "3. User Experience\n", + "Results should include:\n", + "Headline text\n", + "Category label\n", + "Similarity score\n", + "Results should be clearly ranked.\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "2f06184c-bb8b-4df9-9db9-e5abaaa09339", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.metrics.pairwise import cosine_similarity" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "addacbcf-e04e-4820-abce-432a53a7e134", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
linkheadlinecategoryshort_descriptionauthorsdate
0https://www.huffpost.com/entry/covid-boosters-...Over 4 Million Americans Roll Up Sleeves For O...U.S. NEWSHealth experts said it is too early to predict...Carla K. Johnson, AP2022-09-23
1https://www.huffpost.com/entry/american-airlin...American Airlines Flyer Charged, Banned For Li...U.S. NEWSHe was subdued by passengers and crew when he ...Mary Papenfuss2022-09-23
2https://www.huffpost.com/entry/funniest-tweets...23 Of The Funniest Tweets About Cats And Dogs ...COMEDY\"Until you have a dog you don't understand wha...Elyse Wanshel2022-09-23
3https://www.huffpost.com/entry/funniest-parent...The Funniest Tweets From Parents This Week (Se...PARENTING\"Accidentally put grown-up toothpaste on my to...Caroline Bologna2022-09-23
4https://www.huffpost.com/entry/amy-cooper-lose...Woman Who Called Cops On Black Bird-Watcher Lo...U.S. NEWSAmy Cooper accused investment firm Franklin Te...Nina Golgowski2022-09-22
.....................
209522https://www.huffingtonpost.com/entry/rim-ceo-t...RIM CEO Thorsten Heins' 'Significant' Plans Fo...TECHVerizon Wireless and AT&T are already promotin...Reuters, Reuters2012-01-28
209523https://www.huffingtonpost.com/entry/maria-sha...Maria Sharapova Stunned By Victoria Azarenka I...SPORTSAfterward, Azarenka, more effusive with the pr...2012-01-28
209524https://www.huffingtonpost.com/entry/super-bow...Giants Over Patriots, Jets Over Colts Among M...SPORTSLeading up to Super Bowl XLVI, the most talked...2012-01-28
209525https://www.huffingtonpost.com/entry/aldon-smi...Aldon Smith Arrested: 49ers Linebacker Busted ...SPORTSCORRECTION: An earlier version of this story i...2012-01-28
209526https://www.huffingtonpost.com/entry/dwight-ho...Dwight Howard Rips Teammates After Magic Loss ...SPORTSThe five-time all-star center tore into his te...2012-01-28
\n", + "

209527 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " link \\\n", + "0 https://www.huffpost.com/entry/covid-boosters-... \n", + "1 https://www.huffpost.com/entry/american-airlin... \n", + "2 https://www.huffpost.com/entry/funniest-tweets... \n", + "3 https://www.huffpost.com/entry/funniest-parent... \n", + "4 https://www.huffpost.com/entry/amy-cooper-lose... \n", + "... ... \n", + "209522 https://www.huffingtonpost.com/entry/rim-ceo-t... \n", + "209523 https://www.huffingtonpost.com/entry/maria-sha... \n", + "209524 https://www.huffingtonpost.com/entry/super-bow... \n", + "209525 https://www.huffingtonpost.com/entry/aldon-smi... \n", + "209526 https://www.huffingtonpost.com/entry/dwight-ho... \n", + "\n", + " headline category \\\n", + "0 Over 4 Million Americans Roll Up Sleeves For O... U.S. NEWS \n", + "1 American Airlines Flyer Charged, Banned For Li... U.S. NEWS \n", + "2 23 Of The Funniest Tweets About Cats And Dogs ... COMEDY \n", + "3 The Funniest Tweets From Parents This Week (Se... PARENTING \n", + "4 Woman Who Called Cops On Black Bird-Watcher Lo... U.S. NEWS \n", + "... ... ... \n", + "209522 RIM CEO Thorsten Heins' 'Significant' Plans Fo... TECH \n", + "209523 Maria Sharapova Stunned By Victoria Azarenka I... SPORTS \n", + "209524 Giants Over Patriots, Jets Over Colts Among M... SPORTS \n", + "209525 Aldon Smith Arrested: 49ers Linebacker Busted ... SPORTS \n", + "209526 Dwight Howard Rips Teammates After Magic Loss ... SPORTS \n", + "\n", + " short_description \\\n", + "0 Health experts said it is too early to predict... \n", + "1 He was subdued by passengers and crew when he ... \n", + "2 \"Until you have a dog you don't understand wha... \n", + "3 \"Accidentally put grown-up toothpaste on my to... \n", + "4 Amy Cooper accused investment firm Franklin Te... \n", + "... ... \n", + "209522 Verizon Wireless and AT&T are already promotin... \n", + "209523 Afterward, Azarenka, more effusive with the pr... \n", + "209524 Leading up to Super Bowl XLVI, the most talked... 
\n", + "209525 CORRECTION: An earlier version of this story i... \n", + "209526 The five-time all-star center tore into his te... \n", + "\n", + " authors date \n", + "0 Carla K. Johnson, AP 2022-09-23 \n", + "1 Mary Papenfuss 2022-09-23 \n", + "2 Elyse Wanshel 2022-09-23 \n", + "3 Caroline Bologna 2022-09-23 \n", + "4 Nina Golgowski 2022-09-22 \n", + "... ... ... \n", + "209522 Reuters, Reuters 2012-01-28 \n", + "209523 2012-01-28 \n", + "209524 2012-01-28 \n", + "209525 2012-01-28 \n", + "209526 2012-01-28 \n", + "\n", + "[209527 rows x 6 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_json(\"News Search Engine_Dataset_v3.json\", lines=True)\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "dc1526b9-d9c5-4c41-bf80-9c605591d8fb", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\bbuser\\AppData\\Local\\Temp\\ipykernel_14104\\1082960361.py:7: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. 
Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", + " .apply(lambda group: group.sample(n=1000, random_state=42))\n" + ] + } + ], + "source": [ + "# Filter to specified categories only\n", + "allowed_categories = {\"POLITICS\", \"TRAVEL\", \"SPORTS\", \"HOME & LIVING\"}\n", + "df = df.loc[df[\"category\"].isin(allowed_categories)]\n", + "\n", + "# Create balanced dataset with 1000 samples per group\n", + "df_balanced = (df.groupby(\"category\")\n", + " .sample(n=1000, random_state=42)\n", + " .reset_index(drop=True))\n", + "\n", + "# Select only required columns\n", + "df_balanced = df_balanced.loc[:, [\"headline\", \"category\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "868a13b6-d8e6-4b72-a67b-b34de131dc86", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(54899, 6)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
linkheadlinecategoryshort_descriptionauthorsdate
0https://www.huffpost.com/entry/dodgers-basebal...Maury Wills, Base-Stealing Shortstop For Dodge...SPORTSMaury Wills, who helped the Los Angeles Dodger...Beth Harris, AP2022-09-20
1https://www.huffpost.com/entry/biden-us-forces...Biden Says U.S. Forces Would Defend Taiwan If ...POLITICSPresident issues vow as tensions with China rise.2022-09-19
2https://www.huffpost.com/entry/ukraine-festiva...‘Beautiful And Sad At The Same Time’: Ukrainia...POLITICSAn annual celebration took on a different feel...Jonathan Nicholson2022-09-19
3https://www.huffpost.com/entry/2022-wnba-final...Las Vegas Aces Win First WNBA Title, Chelsea G...SPORTSLas Vegas never had a professional sports cham...Pat Eaton-Robb, AP2022-09-19
4https://www.huffpost.com/entry/europe-britain-...Biden Says Queen's Death Left 'Giant Hole' For...POLITICSU.S. President Joe Biden, in London for the fu...Darlene Superville, AP2022-09-18
5https://www.huffpost.com/entry/afghan-adjustme...Bill To Help Afghans Who Escaped Taliban Faces...POLITICSRepublican outrage over the shoddy U.S. withdr...Hamed Ahmadi and Arthur Delaney2022-09-16
6https://www.huffpost.com/entry/capitol-riot-in...Mark Meadows Complies With Justice Dept. Subpo...POLITICSThe former White House chief of staff has turn...ERIC TUCKER, AP2022-09-15
7https://www.huffpost.com/entry/seth-magaziner-...Democrats Nominate Seth Magaziner In Key Rhode...POLITICSThe state's general treasurer is slated to fac...Daniel Marans2022-09-14
8https://www.huffpost.com/entry/biden-cancer-mo...Joe Biden Urges National Unity In Speech On Re...POLITICS\"Cancer does not discriminate red and blue,\" t...Nick Visser2022-09-13
9https://www.huffpost.com/entry/tim-scott-senat...Sen. Tim Scott Downplays Electability Concerns...POLITICS\"Who we have on the field is who we’re gonna p...Marita Vlachou2022-09-12
\n", + "
" + ], + "text/plain": [ + " link \\\n", + "0 https://www.huffpost.com/entry/dodgers-basebal... \n", + "1 https://www.huffpost.com/entry/biden-us-forces... \n", + "2 https://www.huffpost.com/entry/ukraine-festiva... \n", + "3 https://www.huffpost.com/entry/2022-wnba-final... \n", + "4 https://www.huffpost.com/entry/europe-britain-... \n", + "5 https://www.huffpost.com/entry/afghan-adjustme... \n", + "6 https://www.huffpost.com/entry/capitol-riot-in... \n", + "7 https://www.huffpost.com/entry/seth-magaziner-... \n", + "8 https://www.huffpost.com/entry/biden-cancer-mo... \n", + "9 https://www.huffpost.com/entry/tim-scott-senat... \n", + "\n", + " headline category \\\n", + "0 Maury Wills, Base-Stealing Shortstop For Dodge... SPORTS \n", + "1 Biden Says U.S. Forces Would Defend Taiwan If ... POLITICS \n", + "2 ‘Beautiful And Sad At The Same Time’: Ukrainia... POLITICS \n", + "3 Las Vegas Aces Win First WNBA Title, Chelsea G... SPORTS \n", + "4 Biden Says Queen's Death Left 'Giant Hole' For... POLITICS \n", + "5 Bill To Help Afghans Who Escaped Taliban Faces... POLITICS \n", + "6 Mark Meadows Complies With Justice Dept. Subpo... POLITICS \n", + "7 Democrats Nominate Seth Magaziner In Key Rhode... POLITICS \n", + "8 Joe Biden Urges National Unity In Speech On Re... POLITICS \n", + "9 Sen. Tim Scott Downplays Electability Concerns... POLITICS \n", + "\n", + " short_description \\\n", + "0 Maury Wills, who helped the Los Angeles Dodger... \n", + "1 President issues vow as tensions with China rise. \n", + "2 An annual celebration took on a different feel... \n", + "3 Las Vegas never had a professional sports cham... \n", + "4 U.S. President Joe Biden, in London for the fu... \n", + "5 Republican outrage over the shoddy U.S. withdr... \n", + "6 The former White House chief of staff has turn... \n", + "7 The state's general treasurer is slated to fac... \n", + "8 \"Cancer does not discriminate red and blue,\" t... \n", + "9 \"Who we have on the field is who we’re gonna p... 
\n", + "\n", + " authors date \n", + "0 Beth Harris, AP 2022-09-20 \n", + "1 2022-09-19 \n", + "2 Jonathan Nicholson 2022-09-19 \n", + "3 Pat Eaton-Robb, AP 2022-09-19 \n", + "4 Darlene Superville, AP 2022-09-18 \n", + "5 Hamed Ahmadi and Arthur Delaney 2022-09-16 \n", + "6 ERIC TUCKER, AP 2022-09-15 \n", + "7 Daniel Marans 2022-09-14 \n", + "8 Nick Visser 2022-09-13 \n", + "9 Marita Vlachou 2022-09-12 " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Final dataset\n", + "df = df.reset_index(drop=True)\n", + "print(df.shape)\n", + "df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "54a64886-bce4-48af-89c8-ea973455071d", + "metadata": {}, + "outputs": [], + "source": [ + "# Configure text vectorization parameters\n", + "text_vectorizer = TfidfVectorizer(\n", + " stop_words=\"english\",\n", + " max_features=5000,\n", + " lowercase=True\n", + ")\n", + "\n", + "# Learn vocabulary and transform text data\n", + "X = text_vectorizer.fit_transform(df_balanced[\"headline\"].values)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "7df430ae-e888-44c0-90ba-1cbe4020a54a", + "metadata": {}, + "outputs": [], + "source": [ + "def find_relevant_articles(search_query, result_count=10):\n", + " \"\"\"Retrieve articles most relevant to the search query\"\"\"\n", + " # Convert query to feature vector\n", + " query_vector = text_vectorizer.transform([search_query])\n", + " \n", + " # Calculate similarity scores\n", + " similarity_scores = cosine_similarity(query_vector, X).ravel()\n", + " \n", + " # Identify top matching indices\n", + " top_matches = similarity_scores.argsort()[-result_count:][::-1]\n", + " \n", + " # Compile results with similarity metrics\n", + " search_results = df_balanced.iloc[top_matches].copy()\n", + " search_results['relevance_score'] = similarity_scores[top_matches]\n", + " \n", + " return search_results[['headline', 'category', 'relevance_score']]" + ] + 
}, + { + "cell_type": "code", + "execution_count": 19, + "id": "3aaca833-b19c-4a5b-bf7c-0b2b3e1c3c47", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " headline category \\\n", + "3604 Ohio Voters Have A Chance To Do Something Abou... POLITICS \n", + "3572 U.S. Olympic Committee Ignored Sexual Abuse Co... SPORTS \n", + "2792 Pete Buttigieg, 37-Year-Old Mayor Of City In I... POLITICS \n", + "3976 What The Southwest Flight Can Teach Us About O... TRAVEL \n", + "3148 Trump Is Reportedly Sending New Pal Kim Jong U... POLITICS \n", + "3205 Publix Suspends Contributions To NRA-Backed Po... POLITICS \n", + "3573 Eric Schneiderman Has Always Been A Con Man POLITICS \n", + "3550 Trump's Iran Deal Exit Is A Win For Russia POLITICS \n", + "3800 White House Releases Photos Of Mike Pompeo Wit... POLITICS \n", + "3065 The Kremlin Hates America's 'Malignant Feminis... POLITICS \n", + "\n", + " relevance_score \n", + "3604 0.566451 \n", + "3572 0.564726 \n", + "2792 0.538501 \n", + "3976 0.533820 \n", + "3148 0.487850 \n", + "3205 0.399826 \n", + "3573 0.380509 \n", + "3550 0.370808 \n", + "3800 0.342019 \n", + "3065 0.331552 \n" + ] + } + ], + "source": [ + "search_terms = \"Europe travel\"\n", + "matching_results = find_relevant_articles(search_terms)\n", + "print(matching_results)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "8bf6a0d5-4d91-406d-a54a-2e149c125399", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.566 | POLITICS | Ohio Voters Have A Chance To Do Something About Gerrymandering\n", + "0.565 | SPORTS | U.S. 
Olympic Committee Ignored Sexual Abuse Complaints Against Taekwondo Stars: Lawsuit\n", + "0.539 | POLITICS | Pete Buttigieg, 37-Year-Old Mayor Of City In Indiana, Joins Presidential Race\n", + "0.534 | TRAVEL | What The Southwest Flight Can Teach Us About Oxygen Masks\n", + "0.488 | POLITICS | Trump Is Reportedly Sending New Pal Kim Jong Un An Awkward Gift\n", + "0.400 | POLITICS | Publix Suspends Contributions To NRA-Backed Politician Amid Protests\n", + "0.381 | POLITICS | Eric Schneiderman Has Always Been A Con Man\n", + "0.371 | POLITICS | Trump's Iran Deal Exit Is A Win For Russia\n", + "0.342 | POLITICS | White House Releases Photos Of Mike Pompeo With Kim Jong Un To Praise Confirmation\n", + "0.332 | POLITICS | The Kremlin Hates America's 'Malignant Feminism,' Loves Brett Kavanaugh\n" + ] + } + ], + "source": [ + "for i, row in matching_results.iterrows():\n", + " print(f\"{row['relevance_score']:.3f} | {row['category']} | {row['headline']}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "31ce1745-cc2b-42aa-ab07-2368205e2839", + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Enter search query (or 'exit' to stop): 2\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " headline category \\\n", + "16 US, Trump Team Propose Names For Arbiter In Ma... POLITICS \n", + "17 Politician's DNA Connected To Las Vegas Journa... POLITICS \n", + "18 Michigan Supreme Court Revives Abortion Rights... POLITICS \n", + "19 Portland Residents With Disabilities Sue Over ... POLITICS \n", + "20 Baseball Players Union Joins AFL-CIO In Show O... SPORTS \n", + "21 The Unemployment Insurance System Is Not Ready... POLITICS \n", + "22 Kody Clemens Strikes Out MVP Shohei Ohtani, Tr... SPORTS \n", + "23 Michigan Secretary of State Worried About ‘Vio... POLITICS \n", + "24 Uvalde Fourth Graders Waited An Hour With Woun... POLITICS \n", + "25 Trump-Endorsed Wisconsin Gubernatorial Candida... 
POLITICS \n", + "\n", + " relevance_score \n", + "16 0.0 \n", + "17 0.0 \n", + "18 0.0 \n", + "19 0.0 \n", + "20 0.0 \n", + "21 0.0 \n", + "22 0.0 \n", + "23 0.0 \n", + "24 0.0 \n", + "25 0.0 \n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Enter search query (or 'exit' to stop): exit\n" + ] + } + ], + "source": [ + "search_active = True\n", + "while search_active:\n", + " user_query = input(\"Enter search query (or 'exit' to stop): \")\n", + " if user_query.lower() == 'exit':\n", + " search_active = False\n", + " else:\n", + " print(find_relevant_articles(user_query))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a45855d-b6b6-4865-bfb5-ef85741d7a5e", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:base] *", + "language": "python", + "name": "conda-base-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}