Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
135 changes: 135 additions & 0 deletions parse/DomainsResearch.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Исследование частоиспользуемых доменов"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import bs4, requests\n",
"import re\n",
"import pandas as pd\n",
"import meetup.api\n",
"import numpy as np\n",
"from matplotlib import pyplot as plt\n",
"import seaborn\n",
"from urllib.parse import urlparse"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 https://www.meetup.com/ru-RU/CyberSecurityClub...\n",
"1 https://noosphere.ru/hacknowlege\n",
"2 https://www.meetup.com/GolangKazan/events/2622...\n",
"3 https://www.facebook.com/events/649632242183168\n",
"4 https://index.timepad.ru/event/964050/\n",
"Name: link, dtype: object"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = pd.read_csv(open('data.csv'))\n",
"data = data.dropna(subset=['link'])\n",
"data['link'].head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Топ-20 самых популярных доменов для организации мероприятий:\n",
"timepad 1144\n",
"vk 400\n",
"yandex 349\n",
"redirect link 244\n",
"facebook 216\n",
"scout-gps 172\n",
"ingria-startup 128\n",
"meetup 117\n",
"dataart 116\n",
"mail 92\n",
"dou 91\n",
"ciseventsgroup 88\n",
"iidf 85\n",
"luxoft-training 81\n",
"hse 72\n",
"epam 72\n",
"softline 61\n",
"polikom 59\n",
"cnews 58\n",
"codeib 52\n",
"Name: domains, dtype: int64\n"
]
}
],
"source": [
"def get_domain(ex):\n",
" x = urlparse(ex).netloc\n",
" l = x.replace('www.', '').replace('.ua', '').split('.')\n",
" if len(l) > 1:\n",
" y = l[-2]\n",
" if y == 'google' and '/forms/' in ex:\n",
" return 'google forms'\n",
" if y == 'bit' or y == 'goo':\n",
" return 'redirect link'\n",
" return y\n",
" return l[0]\n",
"# ex = 'https://www.facebook.com/events/649632242183168'\n",
"data['domains'] = data['link'].apply(get_domain)\n",
"domains = data['domains'].value_counts()\n",
"print('Топ-20 самых популярных доменов для организации мероприятий:')\n",
"print(domains.head(20))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading