A small reporting hack (#3621)
* A small reporting hack

I'd considered adding a way to report dataset create/upload statistics for a
date range, but never got around to it. Today, writing up the draft report for
April, I of course didn't have a report "for April", and pasted in one from
April 25...

And then thought, well, how hard would it be to add `--since` and `--until` to
set the range?

```
$ pbench-report-generator --statistics creation --since 2023-1-1 --until 2024-1-1
Dataset statistics by creation date:
 23,147 from 2023-01-01 00:07 to 2023-12-21 13:09
    1,441 in month December 2023
    0 in week December 24 to December 31
    0 on 31 December 2023
 Total by year:
    2023:   23,147
 Total by month of year:
    Jan:    5,722    Feb:    1,449    Mar:    1,356    Apr:    1,924
    May:    1,895    Jun:      979    Jul:      639    Aug:    1,202
    Sep:    2,798    Oct:    1,877    Nov:    1,865    Dec:    1,441
 Total by day of month:
    01:    1,869    02:    1,461    03:    1,273    04:    1,303
    05:    1,366    06:      888    07:      645    08:      662
    09:      556    10:      517    11:      591    12:      544
    13:      792    14:      777    15:      508    16:      485
    17:      564    18:      706    19:      521    20:      539
    21:      441    22:      555    23:      493    24:      535
    25:      541    26:      609    27:      776    28:      771
    29:      748    30:      737    31:      374
 Total by day of week:
    Mon:    2,755    Tue:    3,995    Wed:    4,251    Thu:    3,366
    Fri:    2,478    Sat:    2,981    Sun:    3,321
 Total by hour of day:
    00:    1,022    01:      731    02:      992    03:      993
    04:    1,354    05:    1,145    06:      910    07:      845
    08:      875    09:    1,157    10:    1,317    11:    1,212
    12:    1,132    13:      956    14:      960    15:      941
    16:      783    17:      815    18:      952    19:      904
    20:      890    21:      847    22:      735    23:      679
```
dbutenhof authored May 13, 2024
1 parent 81eb077 commit 19957c2
Showing 2 changed files with 103 additions and 34 deletions.
17 changes: 10 additions & 7 deletions lib/pbench/cli/server/__init__.py
```diff
@@ -23,13 +23,16 @@ class DateParser(ParamType):
     def convert(
         self, value: Any, param: Optional[Parameter], ctx: Optional[Context]
     ) -> Any:
-        if isinstance(value, datetime.datetime):
-            return value
-
-        try:
-            return parser.parse(value)
-        except Exception as e:
-            self.fail(f"{value!r} cannot be converted to a datetime: {str(e)!r}")
+        if isinstance(value, str):
+            try:
+                value = parser.parse(value)
+            except Exception as e:
+                self.fail(f"{value!r} cannot be converted to a datetime: {str(e)!r}")
+        if not isinstance(value, datetime.datetime):
+            self.fail(f"{value!r} ({type(value).__name__}) is unsupported.")
+        if value.tzinfo is None:
+            value = value.replace(tzinfo=datetime.timezone.utc)
+        return value
 
 
 class Detail:
```
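
The net effect of the reworked `convert()` is easier to see outside of Click. A minimal sketch of the same logic as a standalone helper (the helper name and sample values are illustrative; the commit's version reports failures via `self.fail()` rather than raising):

```python
# Sketch of the new DateParser.convert() behavior as a standalone helper.
import datetime

from dateutil import parser


def to_utc_datetime(value):
    """Parse a string into a datetime, defaulting naive values to UTC."""
    if isinstance(value, str):
        value = parser.parse(value)
    if not isinstance(value, datetime.datetime):
        raise ValueError(f"{value!r} ({type(value).__name__}) is unsupported.")
    if value.tzinfo is None:
        # A timestamp without a timezone is assumed to be UTC.
        value = value.replace(tzinfo=datetime.timezone.utc)
    return value


print(to_utc_datetime("2023-1-1"))        # 2023-01-01 00:00:00+00:00
print(to_utc_datetime("2024-1-1 05:30"))  # 2024-01-01 05:30:00+00:00
```
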
120 changes: 93 additions & 27 deletions lib/pbench/cli/server/report.py
```diff
@@ -5,14 +5,15 @@
 import re
 import shutil
 import time
-from typing import Any, Iterator, Optional, Union
+from typing import Any, Optional, Union
 
 import click
 import humanize
-from sqlalchemy import cast, inspect, Row, select, text
+from sqlalchemy import cast, inspect, select, text
+from sqlalchemy.orm import Query
 
 from pbench.cli import pass_cli_context
-from pbench.cli.server import config_setup, Detail, Verify, Watch
+from pbench.cli.server import config_setup, DateParser, Detail, Verify, Watch
 from pbench.cli.server.options import common_options
 from pbench.common.logger import get_pbench_logger
 from pbench.server import BadConfig
```

```diff
@@ -368,30 +369,82 @@ def columnize(
         click.echo(line)
 
 
-def summarize_dates(rows: Iterator[Row], width: int = 80):
+def summarize_dates(base_query: Query, options: dict[str, Any]):
     """Collect and report statistics
 
+    The caller supplies a base query providing a "date" column, effectively
+    "SELECT dataset.upload AS date" so that we can filter on the date and
+    process each match.
+
+    Args:
+        base_query: a SQLAlchemy Query producing a "date" column
+        options: The Click option dictionary
     """
+    width: int = options.get("width")
+    since = options.get("since")
+    until = options.get("until")
+
+    if since and until and since > until:
+        raise Exception("The --until value must be later than the --since value")
+
     by_year = defaultdict(int)
     by_month = defaultdict(int)
     by_day = defaultdict(int)
     by_weekday = defaultdict(int)
     by_hour = defaultdict(int)
 
-    day = datetime.datetime.now(datetime.timezone.utc).replace(
-        hour=0, minute=0, second=0, microsecond=0
+    start = (
+        since if since else datetime.datetime.fromtimestamp(0.0, datetime.timezone.utc)
     )
+    end = until if until else datetime.datetime.now(datetime.timezone.utc)
+
+    # It's convenient to use `--until YYYY-MM-01` to see a month (though
+    # technically that would include a YYYY-MM-01:00:00.00 timestamp), but
+    # bucketizing the day or week based on that anomaly isn't very useful, so
+    # back up the "day" one millisecond to move it into the last day of the
+    # previous month.
+    day = end - datetime.timedelta(milliseconds=1)
+    day = day.replace(hour=0, minute=0, second=0, microsecond=0)
     month = day.replace(day=1)
     year = month.replace(month=1)
     week = day - datetime.timedelta(days=7)
 
+    first: Optional[datetime.datetime] = None
+    last: Optional[datetime.datetime] = None
+
     this_year = 0
     this_month = 0
     this_week = 0
     this_day = 0
+    in_range = 0
+
+    filters = []
+
+    # Create a subquery from our basic select parameters so that we can use
+    # the label (SQL "AS date") in our WHERE filter clauses. (In a direct query
+    # PostgreSQL doesn't allow filtering on renamed columns.)
+    subquery = base_query.subquery()
+    query = Database.db_session.query(subquery.c.date).order_by(subquery.c.date)
+
+    if since:
+        verifier.status(f"Filter since {since}")
+        filters.append(subquery.c.date >= since)
+    if until:
+        verifier.status(f"Filter until {until}")
+        filters.append(subquery.c.date <= until)
+    if filters:
+        query = query.filter(*filters)
+    rows = query.execution_options(stream_results=True).yield_per(SQL_CHUNK)
 
     for row in rows:
         date: datetime.datetime = row[0]
         if not isinstance(date, datetime.datetime):
             detailer.message(f"Got non-datetime row {row}")
             continue
+        if not first:
+            first = date
+        last = date
+        in_range += 1
         by_year[date.year] += 1
         by_month[date.month] += 1
         by_day[date.day] += 1
```
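
The subquery indirection above is worth a standalone illustration: PostgreSQL rejects a WHERE clause that references a SELECT-list alias directly, but the alias becomes an ordinary column once wrapped in a subquery. A minimal sketch, assuming a throwaway SQLite session and a hypothetical `Sample` model standing in for pbench's `Database`/`Dataset`:

```python
# Sketch of the label-then-filter pattern; the Sample model is illustrative.
import datetime

from sqlalchemy import Column, DateTime, Integer, create_engine
from sqlalchemy.orm import declarative_base, Session

Base = declarative_base()


class Sample(Base):
    __tablename__ = "sample"
    id = Column(Integer, primary_key=True)
    uploaded = Column(DateTime)


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)

with Session(engine) as session:
    session.add(Sample(uploaded=datetime.datetime(2023, 6, 1)))
    session.commit()

    # "SELECT uploaded AS date ... WHERE date >= ..." fails in PostgreSQL
    # because the alias isn't visible in WHERE; the subquery's column is.
    base = session.query(Sample.uploaded.label("date"))
    sub = base.subquery()
    query = session.query(sub.c.date).order_by(sub.c.date)
    query = query.filter(sub.c.date >= datetime.datetime(2023, 1, 1))
    print(query.all())  # [(datetime.datetime(2023, 6, 1, 0, 0),)]
```
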
```diff
@@ -407,10 +460,22 @@ def summarize_dates(rows: Iterator[Row], width: int = 80):
         if date >= day:
             this_day += 1
 
-    click.echo(f"    {this_year:,d} this year ({year:%Y})")
-    click.echo(f"    {this_month:,d} this month ({month:%B %Y})")
-    click.echo(f"    {this_week:,d} this week ({week:%B %d} to {day:%B %d})")
-    click.echo(f"    {this_day:,d} today ({day:%d %B %Y})")
+    if not first:
+        click.echo(
+            f" No datasets found between {start:%Y-%m-%d %H:%M} and {end:%Y-%m-%d %H:%M}"
+        )
+        return
+
+    click.echo(f" {in_range:,d} from {first:%Y-%m-%d %H:%M} to {last:%Y-%m-%d %H:%M}")
+
+    if start < year:
+        click.echo(f"    {this_year:,d} in year {year:%Y}")
+    if start < month:
+        click.echo(f"    {this_month:,d} in month {month:%B %Y}")
+    if start < week:
+        click.echo(f"    {this_week:,d} in week {week:%B %d} to {day:%B %d}")
+    if start < day:
+        click.echo(f"    {this_day:,d} on {day:%d %B %Y}")
 
     click.echo(" Total by year:")
     columnize(by_year, width)
```
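
The one-millisecond backup is easiest to verify at a concrete boundary: `--until 2024-1-1` parses to midnight UTC on January 1, and the buckets should land in December 2023, matching the sample output in the commit message. A quick sketch:

```python
import datetime

# --until 2024-1-1 parses to midnight UTC on January 1.
end = datetime.datetime(2024, 1, 1, tzinfo=datetime.timezone.utc)

# Backing up 1ms moves the bucket boundaries into the previous day/month/year.
day = (end - datetime.timedelta(milliseconds=1)).replace(
    hour=0, minute=0, second=0, microsecond=0
)
month = day.replace(day=1)
year = month.replace(month=1)
week = day - datetime.timedelta(days=7)

print(f"{day:%d %B %Y}")               # 31 December 2023
print(f"{month:%B %Y}")                # December 2023
print(f"{year:%Y}")                    # 2023
print(f"{week:%B %d} to {day:%B %d}")  # December 24 to December 31
```
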
```diff
@@ -429,30 +494,21 @@ def report_creation(options: dict[str, Any]):
 
     watcher.update("analyzing upload patterns")
 
-    rows = (
-        Database.db_session.query(
-            cast(Metadata.value["pbench", "date"].as_string(), TZDateTime)
-        )
-        .filter(Metadata.key == "metalog")
-        .execution_options(stream_results=True)
-        .yield_per(SQL_CHUNK)
-    )
+    rows = Database.db_session.query(
+        cast(Metadata.value["pbench", "date"].as_string(), TZDateTime).label("date")
+    ).filter(Metadata.key == "metalog")
     click.echo("Dataset statistics by creation date:")
-    summarize_dates(rows, options.get("width"))
+    summarize_dates(rows, options)
 
 
 def report_uploads(options: dict[str, Any]):
     """Report dataset statistics by upload date"""
 
     watcher.update("analyzing upload patterns")
 
-    rows = (
-        Database.db_session.query(Dataset.uploaded)
-        .execution_options(stream_results=True)
-        .yield_per(SQL_CHUNK)
-    )
+    rows = Database.db_session.query(Dataset.uploaded.label("date"))
     click.echo("Dataset statistics by upload date:")
-    summarize_dates(rows, options.get("width"))
+    summarize_dates(rows, options)
 
 
 def report_audit():
```
```diff
@@ -664,14 +720,24 @@ def report_states():
 @click.option(
     "--progress", "-p", type=float, default=0.0, help="Show periodic progress messages"
 )
+@click.option(
+    "--since",
+    type=DateParser(),
+    help="Confine statistics to datasets uploaded/created since date/time",
+)
 @click.option("--sql", "-s", default=False, is_flag=True, help="Display SQL statistics")
 @click.option(
     "--states", "-S", default=False, is_flag=True, help="Display operational states"
 )
 @click.option(
     "--statistics",
     type=click.Choice(["creation", "upload"], case_sensitive=False),
-    help="Show upload statistics",
+    help="Show dataset statistics by creation or upload timestamp",
 )
+@click.option(
+    "--until",
+    type=DateParser(),
+    help="Confine statistics to datasets uploaded/created until date/time",
+)
 @click.option(
     "--verify", "-v", default=False, is_flag=True, help="Display intermediate messages"
```
```diff
@@ -733,7 +799,7 @@ def report(context: object, **kwargs):
         logger.exception("An error occurred discovering the file tree: {}", exc)
         if kwargs.get("verify"):
             raise
-        click.secho(exc, err=True, bg="red")
+        click.secho(exc, err=True, fg="red")
         rv = 2 if isinstance(exc, BadConfig) else 1
 
     click.get_current_context().exit(rv)
```
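
With both options wired up, the monthly report that motivated the change becomes a one-liner. A hypothetical invocation (output elided); since `DateParser` delegates to dateutil, `--since` and `--until` accept anything `dateutil.parser.parse` understands, and naive values are treated as UTC:

```
$ pbench-report-generator --statistics upload --since 2024-4-1 --until 2024-5-1
Dataset statistics by upload date:
...
```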
