Skip to content

Commit

Permalink
Rename top level module
Browse files Browse the repository at this point in the history
  • Loading branch information
janheinrichmerker committed Feb 23, 2023
1 parent 36b6d82 commit ab4f324
Show file tree
Hide file tree
Showing 97 changed files with 164 additions and 164 deletions.
16 changes: 8 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ The list of search providers should be stored in a single [YAML][yaml-spec] file
- ...
```

In the source code, a search provider corresponds to the Python class [`Service`](web_archive_query_log/model/__init__.py).
In the source code, a search provider corresponds to the Python class [`Service`](archive_query_log/model/__init__.py).

### 2. Archived URLs

Expand Down Expand Up @@ -131,7 +131,7 @@ Each individual file is a GZIP-compressed [JSONL][jsonl-spec] file with one arch
}
```
In the source code, an archived URL corresponds to the Python class [`ArchivedUrl`](web_archive_query_log/model/__init__.py).
In the source code, an archived URL corresponds to the Python class [`ArchivedUrl`](archive_query_log/model/__init__.py).
### 3. Archived Query URLs
Expand Down Expand Up @@ -168,7 +168,7 @@ Each individual file is a GZIP-compressed [JSONL][jsonl-spec] file with one arch
}
```
In the source code, an archived query URL corresponds to the Python class [`ArchivedQueryUrl`](web_archive_query_log/model/__init__.py).
In the source code, an archived query URL corresponds to the Python class [`ArchivedQueryUrl`](archive_query_log/model/__init__.py).
### 4. Archived Raw SERPs
Expand Down Expand Up @@ -205,7 +205,7 @@ Each individual file is a GZIP-compressed [WARC][warc-spec] file with one WARC r
}
```
In the source code, an archived raw SERP corresponds to the Python class [`ArchivedRawSerp`](web_archive_query_log/model/__init__.py).
In the source code, an archived raw SERP corresponds to the Python class [`ArchivedRawSerp`](archive_query_log/model/__init__.py).
### 5. Archived Parsed SERPs
Expand Down Expand Up @@ -255,22 +255,22 @@ Each individual file is a GZIP-compressed [JSONL][jsonl-spec] file with one arch
}
```
In the source code, an archived parsed SERP corresponds to the Python class [`ArchivedParsedSerp`](web_archive_query_log/model/__init__.py).
In the source code, an archived parsed SERP corresponds to the Python class [`ArchivedParsedSerp`](archive_query_log/model/__init__.py).
### Pro Tip: Specify a Custom Data Directory
By default, the data directory is set to [`data/`](data). You can change this with the `--data-directory` option, e.g.:
```shell
python -m web_archive_query_log make archived-urls --data-directory /mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/
python -m archive_query_log make archived-urls --data-directory /mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/
```
### Pro Tip: Limit Scraping for Testing
If the search provider you're scraping queries for is very large and has many domains, testing your settings on a smaller sample from that search provider can be helpful. You can specify a single domain to scrape from like this:

```shell
python -m web_archive_query_log make archived-urls <PROVIDER> <DOMAIN>
python -m archive_query_log make archived-urls <PROVIDER> <DOMAIN>
```

If a domain is very popular and therefore has many archived URLs,
Expand All @@ -280,7 +280,7 @@ from the Wayback Machine's
[CDX API](https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md#pagination-api):
```shell
python -m web_archive_query_log make archived-urls <PROVIDER> <DOMAIN> <CDX_PAGE>
python -m archive_query_log make archived-urls <PROVIDER> <DOMAIN> <CDX_PAGE>
```
## Development
Expand Down
File renamed without changes.
4 changes: 4 additions & 0 deletions archive_query_log/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
"""Entry point for running the package via ``python -m archive_query_log``."""
from archive_query_log.cli import main

if __name__ == "__main__":
    # Only start the command line interface when this module is executed
    # directly (e.g. ``python -m archive_query_log``), not when it is imported.
    main()
File renamed without changes.
8 changes: 8 additions & 0 deletions archive_query_log/cli/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# flake8: noqa
from archive_query_log.cli.main import main
from archive_query_log.cli.alexa import alexa
from archive_query_log.cli.external import external
from archive_query_log.cli.make import make_group
from archive_query_log.cli.stats import stats_command
from archive_query_log.cli.corpus import corpus_command
from archive_query_log.cli.index import index_command
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@

from click import option, Path as PathParam, argument, IntRange

from web_archive_query_log import DATA_DIRECTORY_PATH, CDX_API_URL
from web_archive_query_log.cli.main import main
from web_archive_query_log.cli.util import URL
from archive_query_log import DATA_DIRECTORY_PATH, CDX_API_URL
from archive_query_log.cli.main import main
from archive_query_log.cli.util import URL


@main.group("alexa")
Expand Down Expand Up @@ -32,7 +32,7 @@ def alexa():
default=DATA_DIRECTORY_PATH / "alexa-top-1m-archived-urls.jsonl"
)
def archived_urls(api_url: str, output_path: Path) -> None:
from web_archive_query_log.services.alexa import AlexaTop1MArchivedUrls
from archive_query_log.services.alexa import AlexaTop1MArchivedUrls
AlexaTop1MArchivedUrls(
output_path=output_path,
cdx_api_url=api_url,
Expand Down Expand Up @@ -64,7 +64,7 @@ def archived_urls(api_url: str, output_path: Path) -> None:
default=1000,
)
def domains(data_dir: Path, api_url: str, depth: int) -> None:
from web_archive_query_log.services.alexa import AlexaTop1MFusedDomains
from archive_query_log.services.alexa import AlexaTop1MFusedDomains
AlexaTop1MFusedDomains(
data_directory_path=data_dir,
cdx_api_url=api_url,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,14 @@
from click import option, BOOL
from tqdm.auto import tqdm

from web_archive_query_log import DATA_DIRECTORY_PATH
from web_archive_query_log.cli import main
from web_archive_query_log.cli.util import PathParam
from web_archive_query_log.index import ArchivedRawSerpIndex, \
from archive_query_log import DATA_DIRECTORY_PATH
from archive_query_log.cli import main
from archive_query_log.cli.util import PathParam
from archive_query_log.index import ArchivedRawSerpIndex, \
ArchivedUrlIndex, ArchivedQueryUrlIndex, ArchivedParsedSerpIndex, \
ArchivedSearchResultSnippetIndex, ArchivedRawSearchResultIndex, \
LocatedRecord
from web_archive_query_log.model import ArchivedUrl, CorpusQueryUrl, \
from archive_query_log.model import ArchivedUrl, CorpusQueryUrl, \
ArchivedSearchResultSnippet, CorpusDocument, CorpusJsonlLocation, \
CorpusWarcLocation, ArchivedRawSerp, \
ArchivedQueryUrl, ArchivedParsedSerp, CorpusQuery, CorpusSearchResult
Expand Down Expand Up @@ -71,7 +71,7 @@ def corpus_command(
queries: bool,
output_directory: Path,
) -> None:
from web_archive_query_log.index import ArchivedUrlIndex, \
from archive_query_log.index import ArchivedUrlIndex, \
ArchivedQueryUrlIndex, ArchivedRawSerpIndex, ArchivedParsedSerpIndex, \
ArchivedSearchResultSnippetIndex, ArchivedRawSearchResultIndex

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@
from pandas import DataFrame, read_csv, Series, concat
from yaml import dump

from web_archive_query_log import DATA_DIRECTORY_PATH
from web_archive_query_log.cli import main
from web_archive_query_log.cli.util import PathParam
from archive_query_log import DATA_DIRECTORY_PATH
from archive_query_log.cli import main
from archive_query_log.cli.util import PathParam

sheets_id = "1LnIJYFBYQtZ32rxnT6RPGMOvuRIUQMoEx7tOS0z7Mi8"
sheet_services = "Services"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@
from click import option, BOOL
from tqdm.auto import tqdm

from web_archive_query_log import DATA_DIRECTORY_PATH
from web_archive_query_log.cli import main
from web_archive_query_log.cli.util import PathParam
from web_archive_query_log.index import ArchivedRawSerpIndex, \
from archive_query_log import DATA_DIRECTORY_PATH
from archive_query_log.cli import main
from archive_query_log.cli.util import PathParam
from archive_query_log.index import ArchivedRawSerpIndex, \
ArchivedUrlIndex, ArchivedQueryUrlIndex, ArchivedParsedSerpIndex, \
ArchivedSearchResultSnippetIndex, ArchivedRawSearchResultIndex

Expand Down
File renamed without changes.
26 changes: 13 additions & 13 deletions web_archive_query_log/cli/make.py → archive_query_log/cli/make.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@

from click import option, argument, STRING, IntRange, BOOL

from web_archive_query_log import DATA_DIRECTORY_PATH, CDX_API_URL, LOGGER
from web_archive_query_log.cli import main
from web_archive_query_log.cli.util import PathParam, ServiceChoice
from archive_query_log import DATA_DIRECTORY_PATH, CDX_API_URL, LOGGER
from archive_query_log.cli import main
from archive_query_log.cli.util import PathParam, ServiceChoice


@main.group("make")
Expand Down Expand Up @@ -78,8 +78,8 @@ def archived_urls_command(
domain: str | None,
cdx_page: int | None,
) -> None:
from web_archive_query_log.config import SERVICES
from web_archive_query_log.urls.fetch import ArchivedUrlsFetcher, \
from archive_query_log.config import SERVICES
from archive_query_log.urls.fetch import ArchivedUrlsFetcher, \
UrlMatchScope
service_config = SERVICES[service]
match_scope = UrlMatchScope.PREFIX if focused else UrlMatchScope.DOMAIN
Expand Down Expand Up @@ -122,8 +122,8 @@ def archived_query_urls_command(
domain: str | None,
cdx_page: int | None,
) -> None:
from web_archive_query_log.config import SERVICES
from web_archive_query_log.queries.parse import ArchivedQueryUrlParser
from archive_query_log.config import SERVICES
from archive_query_log.queries.parse import ArchivedQueryUrlParser
service_config = SERVICES[service]
if len(service_config.query_parsers) == 0:
LOGGER.warning(
Expand Down Expand Up @@ -166,8 +166,8 @@ def archived_raw_serps_command(
domain: str | None,
cdx_page: int | None,
) -> None:
from web_archive_query_log.config import SERVICES
from web_archive_query_log.download.warc import WebArchiveWarcDownloader
from archive_query_log.config import SERVICES
from archive_query_log.download.warc import WebArchiveWarcDownloader
service_config = SERVICES[service]
downloader = WebArchiveWarcDownloader(verbose=True)
if focused:
Expand Down Expand Up @@ -197,8 +197,8 @@ def archived_parsed_serps_command(
domain: str | None,
cdx_page: int | None,
) -> None:
from web_archive_query_log.config import SERVICES
from web_archive_query_log.results.parse import ArchivedParsedSerpParser
from archive_query_log.config import SERVICES
from archive_query_log.results.parse import ArchivedParsedSerpParser
service_config = SERVICES[service]
if len(service_config.results_parsers) == 0:
LOGGER.warning(
Expand Down Expand Up @@ -240,8 +240,8 @@ def archived_raw_search_results_command(
domain: str | None,
cdx_page: int | None,
) -> None:
from web_archive_query_log.config import SERVICES
from web_archive_query_log.download.warc import WebArchiveWarcDownloader
from archive_query_log.config import SERVICES
from archive_query_log.download.warc import WebArchiveWarcDownloader
service_config = SERVICES[service]
downloader = WebArchiveWarcDownloader(verbose=True)
if focused:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@
from pandas import DataFrame
from tqdm.auto import tqdm

from web_archive_query_log import DATA_DIRECTORY_PATH, CDX_API_URL, LOGGER
from web_archive_query_log.cli import main
from web_archive_query_log.cli.util import PathParam
from web_archive_query_log.config import SERVICES
from archive_query_log import DATA_DIRECTORY_PATH, CDX_API_URL, LOGGER
from archive_query_log.cli import main
from archive_query_log.cli.util import PathParam
from archive_query_log.config import SERVICES

# See:
# https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md#pagination-api
Expand All @@ -24,8 +24,8 @@ def _all_archived_urls(
focused: bool,
service: str,
) -> int:
from web_archive_query_log.config import SERVICES
from web_archive_query_log.urls.fetch import ArchivedUrlsFetcher, \
from archive_query_log.config import SERVICES
from archive_query_log.urls.fetch import ArchivedUrlsFetcher, \
UrlMatchScope
service_config = SERVICES[service]
match_scope = UrlMatchScope.PREFIX if focused else UrlMatchScope.DOMAIN
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def __init__(self) -> None:

def _ensure_choices(self):
if len(self.choices) == 0:
from web_archive_query_log.config import SERVICES
from archive_query_log.config import SERVICES
self.choices = sorted(SERVICES.keys())

def to_info_dict(self) -> Dict[str, Any]:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from typing import Mapping

from web_archive_query_log import DATA_DIRECTORY_PATH
from web_archive_query_log.model import Service
from web_archive_query_log.services import read_services
from archive_query_log import DATA_DIRECTORY_PATH
from archive_query_log.model import Service
from archive_query_log.services import read_services

# Load all services that have parsers and create the services for them.
SERVICES_PATH = DATA_DIRECTORY_PATH / "selected-services.yaml"
Expand Down
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
WarcRecord
from marshmallow import Schema

from web_archive_query_log import LOGGER
from web_archive_query_log.model import ArchivedQueryUrl, ArchivedRawSerp
from archive_query_log import LOGGER
from archive_query_log.model import ArchivedQueryUrl, ArchivedRawSerp


@dataclass(frozen=True)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
from asyncio_pool import AioPool
from tqdm.auto import tqdm

from web_archive_query_log.model import ArchivedUrl
from web_archive_query_log.util.archive_http import archive_http_client
from archive_query_log.model import ArchivedUrl
from archive_query_log.util.archive_http import archive_http_client


class WebArchiveRawDownloader:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@
from tqdm.auto import tqdm
from warcio import WARCWriter, StatusAndHeaders

from web_archive_query_log.model import ArchivedUrl, Service
from web_archive_query_log.queries.iterable import ArchivedQueryUrls
from web_archive_query_log.serps.iterable import ArchivedParsedSerps
from web_archive_query_log.util.archive_http import archive_http_client
from archive_query_log.model import ArchivedUrl, Service
from archive_query_log.queries.iterable import ArchivedQueryUrls
from archive_query_log.serps.iterable import ArchivedParsedSerps
from archive_query_log.util.archive_http import archive_http_client


class _CdxPage(NamedTuple):
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,12 @@
from marshmallow import Schema
from tqdm.auto import tqdm

from web_archive_query_log import DATA_DIRECTORY_PATH, LOGGER
from web_archive_query_log.model import ArchivedUrl, ArchivedQueryUrl, \
from archive_query_log import DATA_DIRECTORY_PATH, LOGGER
from archive_query_log.model import ArchivedUrl, ArchivedQueryUrl, \
ArchivedParsedSerp, ArchivedSearchResultSnippet, ArchivedRawSerp, \
ArchivedRawSearchResult, CorpusJsonlLocation, CorpusJsonlSnippetLocation, \
CorpusWarcLocation
from web_archive_query_log.util.text import count_lines
from archive_query_log.util.text import count_lines


@dataclass(frozen=True)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
from dataclasses_json import DataClassJsonMixin, config
from marshmallow.fields import List, Nested, String, Field

from web_archive_query_log.model.highlight import HighlightedText
from web_archive_query_log.util.serialization import HighlightedTextField
from archive_query_log.model.highlight import HighlightedText
from archive_query_log.util.serialization import HighlightedTextField


@dataclass(frozen=True, slots=True)
Expand Down Expand Up @@ -254,7 +254,7 @@ class ArchivedParsedSearchResult(ArchivedSearchResultSnippet,


# flake8: noqa: E402
from web_archive_query_log.model.parse import QueryParser, \
from archive_query_log.model.parse import QueryParser, \
PageParser, OffsetParser, QueryParserField, PageOffsetParserField, \
ResultsParserField, InterpretedQueryParserField, InterpretedQueryParser, \
ResultsParser
Expand Down
File renamed without changes.
Loading

0 comments on commit ab4f324

Please sign in to comment.