Commit c1ed765

Fix code format

janheinrichmerker committed Feb 23, 2023
1 parent 256c839 commit c1ed765

Showing 50 changed files with 119 additions and 42 deletions.
2 changes: 1 addition & 1 deletion archive_query_log/cli/corpus.py
@@ -275,7 +275,7 @@ def _build_search_result(
         .get(archived_search_result_snippet.id)
     # archived_parsed_search_result_loc = archived_parsed_search_result_index \
     #     .get(archived_search_result_snippet.id)
-    archived_parsed_search_result_loc = None
+    # archived_parsed_search_result_loc = None
     return CorpusSearchResult(
         id=archived_search_result_snippet.id,
         url=archived_search_result_snippet.url,
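Aside: commenting out the dead assignment above silences flake8's F841
("local variable is assigned to but never used"), assuming the value is no
longer consumed downstream. A toy illustration of the flagged pattern
(hypothetical function, not project code):

    def build_result():
        unused_loc = None  # flake8 F841: assigned but never used
        return {"id": 1}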
26 changes: 19 additions & 7 deletions archive_query_log/cli/external.py
@@ -3,7 +3,6 @@
 from pathlib import Path
 from re import compile, escape
 from urllib.parse import quote
-from numpy import nan
 
 from click import argument
 from pandas import DataFrame, read_csv, Series, concat
@@ -20,6 +19,7 @@
 sheet_query_parsers = "Query Parsers"
 sheet_page_parsers = "Page Parsers"
 
+
 @main.group("external")
 def external():
     pass
@@ -90,6 +90,7 @@ def load_query_parsers() -> DataFrame:
     df["query_parser"] = df["value"]
     return df[["name", "query_parser"]]
 
+
 def load_page_offset_parsers() -> DataFrame:
     df = from_sheets(sheet_page_parsers, transpose=True)
     df["value"].replace("NULL", "{}", inplace=True)
@@ -128,9 +129,12 @@ def query_parser(row: Series) -> dict:
     else:
         raise NotImplementedError()
 
+
 page_offset_parser_map = {"parameter": "query_parameter",
                           "suffix": "path_suffix",
                           "fragment": "fragment_parameter"}
+
+
 def page_offset_parser(row: Series, count="results") -> dict:
     row = row.to_dict()
     row.update(loads(row["page_offset_parser"]))
@@ -140,9 +144,10 @@ def page_offset_parser(row: Series, count="results") -> dict:
             "url_pattern": url_pattern,
             "type": page_offset_parser_map[row["type"]],
             "parameter": row["key"]
-            }
+        }
     else:
-        return NotImplementedError()
+        raise NotImplementedError()
 
+
 def page_offset_parser_series(page_offset_parsers, services, count):
     return [
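Note on the hunk above: the original "return NotImplementedError()" handed
callers an exception *instance* as an ordinary return value instead of
raising it, so the unsupported case failed silently. A minimal sketch of
the difference (hypothetical helper, not project code):

    def parse_wrong(kind: str):
        if kind == "parameter":
            return {"type": "query_parameter"}
        return NotImplementedError()  # bug: returned, never raised

    def parse_right(kind: str):
        if kind == "parameter":
            return {"type": "query_parameter"}
        raise NotImplementedError()  # correct: aborts with a traceback

    result = parse_wrong("fragment")
    print(isinstance(result, NotImplementedError))  # True -- error goes unnoticed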
@@ -151,12 +156,15 @@ def page_offset_parser_series(page_offset_parsers, services, count):
             for _, row in
             page_offset_parsers[
                 (page_offset_parsers["name"].str.fullmatch(service["name"])) &
-                (page_offset_parsers["page_offset_parser"].str.contains(f'"count": "{count}"'))
-            ].iterrows()
+                (page_offset_parsers["page_offset_parser"].str.contains(
+                    f'"count": "{count}"'
+                ))
+            ].iterrows()
         ), key=lambda pp: str(pp["url_pattern"]))
         for _, service in services.iterrows()
     ]
 
+
 @external.command("import-services")
 @argument(
     "services-file",
@@ -200,8 +208,12 @@ def import_services(services_file: Path):
             load_page_offset_parsers()[["page_offset_parser"]]
         ],
         axis="columns")
-    services["page_parsers"] = page_offset_parser_series(page_offset_parsers, services, count="pages")
-    services["offset_parsers"] = page_offset_parser_series(page_offset_parsers, services, count="results")
+    services["page_parsers"] = page_offset_parser_series(
+        page_offset_parsers, services, count="pages"
+    )
+    services["offset_parsers"] = page_offset_parser_series(
+        page_offset_parsers, services, count="results"
+    )
     services["interpreted_query_parsers"] = [
         []
         for _, service in services.iterrows()
6 changes: 4 additions & 2 deletions archive_query_log/cli/stats.py
@@ -204,11 +204,13 @@ def stats_command(
             if document["archived_snippet_location"] is not None:
                 results[service_name]["archived-snippets"] += 1
             if document[
-                    "archived_raw_search_result_location"] is not None:
+                "archived_raw_search_result_location"
+            ] is not None:
                 results[service_name][
                     "archived-raw-search-results"] += 1
             if document[
-                    "archived_parsed_search_result_location"] is not None:
+                "archived_parsed_search_result_location"
+            ] is not None:
                 results[service_name][
                     "archived-parsed-search-results"] += 1
 
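Aside: wrapping the subscript across lines satisfies the line-length
check, but binding the value to a local name first reads more naturally.
A self-contained sketch of that alternative (data illustrative):

    document = {"archived_raw_search_result_location": "warc/rec-123"}
    results = {"google": {"archived-raw-search-results": 0}}
    service_name = "google"

    location = document["archived_raw_search_result_location"]
    if location is not None:
        results[service_name]["archived-raw-search-results"] += 1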
2 changes: 1 addition & 1 deletion archive_query_log/download/warc.py
@@ -365,7 +365,7 @@ async def download_service(
     if focused:
         pages = tqdm(
             pages,
-            desc=f"Deduplicate query URLs",
+            desc="Deduplicate query URLs",
             unit="page",
         )
 
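Aside: the f prefix here was dead weight; pyflakes reports an f-string
without any placeholders as F541. The fix is simply a plain literal:

    desc = f"Deduplicate query URLs"  # flake8 F541: no placeholders
    desc = "Deduplicate query URLs"   # same value, no warning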
1 change: 1 addition & 0 deletions archive_query_log/results/test/generate_tests.py
@@ -149,6 +149,7 @@ def main():
     if not test_path.exists():
         with test_path.open("wt") as o:
             o.write(dedent("""
+                # flake8: noqa
                 # This file is auto-generated by generate_tests.py.
                 """).lstrip())
             o.write(dedent("""
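Aside: a "# flake8: noqa" comment on a line of its own disables flake8 for
the entire file (unlike a per-line "# noqa"), which is why the generator
now prepends it to every auto-generated test below — presumably because
those files contain long literal URLs that would otherwise trip the
line-length check. Sketch of the resulting file header:

    # flake8: noqa
    # This file is auto-generated by generate_tests.py.
    from archive_query_log.results.test.test_utils import verify_serp_parsing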
1 change: 1 addition & 0 deletions archive_query_log/results/test/test_360_serp_parsing.py
@@ -1,3 +1,4 @@
+# flake8: noqa
 # This file is auto-generated by generate_tests.py.
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 

@@ -1,3 +1,4 @@
+# flake8: noqa
 # This file is auto-generated by generate_tests.py.
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 

1 change: 1 addition & 0 deletions archive_query_log/results/test/test_amazon_serp_parsing.py
@@ -1,3 +1,4 @@
+# flake8: noqa
 # This file is auto-generated by generate_tests.py.
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 

1 change: 1 addition & 0 deletions archive_query_log/results/test/test_baidu_serp_parsing.py
@@ -1,3 +1,4 @@
+# flake8: noqa
 # This file is auto-generated by generate_tests.py.
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 

1 change: 1 addition & 0 deletions archive_query_log/results/test/test_bing_serp_parsing.py
@@ -1,3 +1,4 @@
+# flake8: noqa
 # This file is auto-generated by generate_tests.py.
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 

@@ -1,3 +1,4 @@
+# flake8: noqa
 # This file is auto-generated by generate_tests.py.
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 

1 change: 1 addition & 0 deletions archive_query_log/results/test/test_canva_serp_parsing.py
@@ -1,3 +1,4 @@
+# flake8: noqa
 # This file is auto-generated by generate_tests.py.
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 

1 change: 1 addition & 0 deletions archive_query_log/results/test/test_cnn_serp_parsing.py
@@ -1,3 +1,4 @@
+# flake8: noqa
 # This file is auto-generated by generate_tests.py.
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 

1 change: 1 addition & 0 deletions archive_query_log/results/test/test_csdn_serp_parsing.py
@@ -1,3 +1,4 @@
+# flake8: noqa
 # This file is auto-generated by generate_tests.py.
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 

1 change: 1 addition & 0 deletions archive_query_log/results/test/test_ebay_serp_parsing.py
@@ -1,3 +1,4 @@
+# flake8: noqa
 # This file is auto-generated by generate_tests.py.
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 

1 change: 1 addition & 0 deletions archive_query_log/results/test/test_espn_serp_parsing.py
@@ -1,3 +1,4 @@
+# flake8: noqa
 # This file is auto-generated by generate_tests.py.
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 

1 change: 1 addition & 0 deletions archive_query_log/results/test/test_etsy_serp_parsing.py
@@ -1,3 +1,4 @@
+# flake8: noqa
 # This file is auto-generated by generate_tests.py.
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 

@@ -1,3 +1,4 @@
+# flake8: noqa
 # This file is auto-generated by generate_tests.py.
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 

1 change: 1 addition & 0 deletions archive_query_log/results/test/test_github_serp_parsing.py
@@ -1,3 +1,4 @@
+# flake8: noqa
 # This file is auto-generated by generate_tests.py.
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 
1 change: 1 addition & 0 deletions archive_query_log/results/test/test_google_serp_parsing.py
@@ -1,3 +1,4 @@
+# flake8: noqa
 # This file is auto-generated by generate_tests.py.
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 

1 change: 1 addition & 0 deletions archive_query_log/results/test/test_imdb_serp_parsing.py
@@ -1,3 +1,4 @@
+# flake8: noqa
 # This file is auto-generated by generate_tests.py.
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 

1 change: 1 addition & 0 deletions archive_query_log/results/test/test_imgur_serp_parsing.py
@@ -1,3 +1,4 @@
+# flake8: noqa
 # This file is auto-generated by generate_tests.py.
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 

1 change: 1 addition & 0 deletions archive_query_log/results/test/test_indeed_serp_parsing.py
@@ -1,3 +1,4 @@
+# flake8: noqa
 # This file is auto-generated by generate_tests.py.
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 

1 change: 1 addition & 0 deletions archive_query_log/results/test/test_jd_serp_parsing.py
@@ -1,3 +1,4 @@
+# flake8: noqa
 # This file is auto-generated by generate_tests.py.
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 

@@ -1,3 +1,4 @@
+# flake8: noqa
 # This file is auto-generated by generate_tests.py.
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 

@@ -1,3 +1,4 @@
+# flake8: noqa
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 
 

@@ -1,3 +1,4 @@
+# flake8: noqa
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 
 

@@ -1,3 +1,4 @@
+# flake8: noqa
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 
 

1 change: 1 addition & 0 deletions archive_query_log/results/test/test_naver_serp_parsing.py
@@ -1,3 +1,4 @@
+# flake8: noqa
 # This file is auto-generated by generate_tests.py.
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 

@@ -1,3 +1,4 @@
+# flake8: noqa
 # This file is auto-generated by generate_tests.py.
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 

1 change: 1 addition & 0 deletions archive_query_log/results/test/test_qq_serp_parsing.py
@@ -1,3 +1,4 @@
+# flake8: noqa
 # This file is auto-generated by generate_tests.py.
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 

1 change: 1 addition & 0 deletions archive_query_log/results/test/test_reddit_serp_parsing.py
@@ -1,3 +1,4 @@
+# flake8: noqa
 # This file is auto-generated by generate_tests.py.
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 
1 change: 1 addition & 0 deletions archive_query_log/results/test/test_roblox_serp_parsing.py
@@ -1,3 +1,4 @@
+# flake8: noqa
 # This file is auto-generated by generate_tests.py.
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 

1 change: 1 addition & 0 deletions archive_query_log/results/test/test_sogou_serp_parsing.py
@@ -1,3 +1,4 @@
+# flake8: noqa
 # This file is auto-generated by generate_tests.py.
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 

@@ -1,3 +1,4 @@
+# flake8: noqa
 # This file is auto-generated by generate_tests.py.
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 

@@ -1,3 +1,4 @@
+# flake8: noqa
 # This file is auto-generated by generate_tests.py.
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 

1 change: 1 addition & 0 deletions archive_query_log/results/test/test_twitch_serp_parsing.py
@@ -1,3 +1,4 @@
+# flake8: noqa
 # This file is auto-generated by generate_tests.py.
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 

@@ -1,3 +1,4 @@
+# flake8: noqa
 # This file is auto-generated by generate_tests.py.
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 

1 change: 1 addition & 0 deletions archive_query_log/results/test/test_vk_serp_parsing.py
@@ -1,3 +1,4 @@
+# flake8: noqa
 # This file is auto-generated by generate_tests.py.
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 

1 change: 1 addition & 0 deletions archive_query_log/results/test/test_weibo_serp_parsing.py
@@ -1,3 +1,4 @@
+# flake8: noqa
 # This file is auto-generated by generate_tests.py.
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 

@@ -1,3 +1,4 @@
+# flake8: noqa
 # This file is auto-generated by generate_tests.py.
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 

@@ -1,3 +1,4 @@
+# flake8: noqa
 # This file is auto-generated by generate_tests.py.
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 

1 change: 1 addition & 0 deletions archive_query_log/results/test/test_yahoo_serp_parsing.py
@@ -1,3 +1,4 @@
+# flake8: noqa
 # This file is auto-generated by generate_tests.py.
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 

1 change: 1 addition & 0 deletions archive_query_log/results/test/test_yandex_serp_parsing.py
@@ -1,3 +1,4 @@
+# flake8: noqa
 # This file is auto-generated by generate_tests.py.
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 

@@ -1,3 +1,4 @@
+# flake8: noqa
 # This file is auto-generated by generate_tests.py.
 from archive_query_log.results.test.test_utils import verify_serp_parsing
 
3 changes: 2 additions & 1 deletion archive_query_log/service_stats.py
@@ -26,7 +26,8 @@
     for service in SERVICES.values()
 )
 print(
-    f"Number of interpreted query parsers: {num_interpreted_query_parsers}")
+    f"Number of interpreted query parsers: {num_interpreted_query_parsers}"
+)
 num_results_parsers = sum(
     len(service.results_parsers)
     for service in SERVICES.values()
4 changes: 3 additions & 1 deletion archive_query_log/services/__init__.py
@@ -6,7 +6,9 @@
 from archive_query_log.model import Service
 
 
-def read_services(path: Path, ignore_parsing_errors=True) -> Mapping[str, Service]:
+def read_services(
+        path: Path, ignore_parsing_errors=True
+) -> Mapping[str, Service]:
     with path.open("r") as file:
         services_dict = safe_load(file)
     services = []
22 changes: 14 additions & 8 deletions archive_query_log/services/search_forms.py
@@ -98,10 +98,12 @@ def check_url(self, url: str):
         # Look for elements with the pattern in them and save the snippets
         soup = BeautifulSoup(html, 'html.parser')
         found_input, input_snippets = find_input_tag(soup=soup)
-        found_search_form, form_snippets = find_search_tag(soup=soup, tag='form')
+        found_search_form, form_snippets = find_search_tag(
+            soup=soup, tag='form')
         found_search_div, div_snippets = find_search_tag(soup=soup, tag='div')
 
-        return found_input, found_search_form, found_search_div, input_snippets, form_snippets, div_snippets
+        return found_input, found_search_form, found_search_div, \
+            input_snippets, form_snippets, div_snippets
 
     def get_internet_archive_html(self, url: str, year=2022, byte_digits=4):
         """
@@ -170,24 +172,28 @@ def find_search_tag(soup: BeautifulSoup, tag='form'):
     return found, snippet_list
 
 
-
 if __name__ == "__main__":
     # Parse input
-    parser = argparse.ArgumentParser(prog='Search form identification',
-                                     description='Takes in a CSV-File of services and looks for search forms '
-                                                 'in their HTML')
+    parser = argparse.ArgumentParser(
+        prog='Search form identification',
+        description='Takes in a CSV-File of services '
+                    'and looks for search forms in their HTML')
     parser.add_argument('-f', '--csv_file', type=str)
     parser.add_argument('-o', '--outfile_num', type=str)
     parser.add_argument('-s', '--start_row', type=int)
     parser.add_argument('-e', '--end_row', type=int)
     args = parser.parse_args()
 
     # Set/Update default values
-    csv_file = './alexa-top-1m-fused-domains-rrf-top-10000.csv' if args.csv_file is None else args.csv_file
+    csv_file = './alexa-top-1m-fused-domains-rrf-top-10000.csv' \
+        if args.csv_file is None else args.csv_file
     outfile_num = "0" if args.outfile_num is None else args.outfile_num
     start_row = 0 if args.start_row is None else args.start_row
     end_row = None if args.end_row is None else args.end_row
 
     # Run the search for specified services
-    identifier = SearchFormIdentifier(csv_file=csv_file, outfile_num=outfile_num, start_row=start_row, end_row=end_row)
+    identifier = SearchFormIdentifier(
+        csv_file=csv_file, outfile_num=outfile_num,
+        start_row=start_row, end_row=end_row
+    )
    identifier.process_services()
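Aside: the None-check defaults above could instead live on the arguments
themselves via argparse's default= parameter, which removes the post-hoc
"x if args.x is None else args.x" pattern entirely. A self-contained
sketch (the CSV path is the one from the script; other values are
illustrative):

    import argparse

    parser = argparse.ArgumentParser(prog='Search form identification')
    parser.add_argument(
        '-f', '--csv_file', type=str,
        default='./alexa-top-1m-fused-domains-rrf-top-10000.csv')
    parser.add_argument('-o', '--outfile_num', type=str, default="0")
    parser.add_argument('-s', '--start_row', type=int, default=0)
    parser.add_argument('-e', '--end_row', type=int, default=None)
    args = parser.parse_args([])  # empty argv: exercise the defaults
    print(args.csv_file)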