Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Website traffic stats gathering #1679

Draft
wants to merge 4 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions config/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@
"libraries",
"mailing_list",
"news",
"reports",
"core",
"slack",
]
Expand Down
77 changes: 4 additions & 73 deletions libraries/forms.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,16 @@
import io
import base64
from functools import cached_property
from itertools import groupby, chain
from operator import attrgetter
from dataclasses import dataclass, field
from datetime import date, timedelta

import psycopg2
from wordcloud import WordCloud, STOPWORDS
from matplotlib import pyplot as plt

from django.template.loader import render_to_string
from django.db.models import F, Q, Count, OuterRef, Sum, When, Value, Case
from django.forms import Form, ModelChoiceField, ModelForm, BooleanField
from django.conf import settings

from core.models import RenderedContent, SiteSettings
from libraries.utils import batched, boost_normalize_words, grey_color_func
from core.models import RenderedContent
from reports.generation import ReportVisualization
from slack.models import Channel, SlackActivityBucket, SlackUser
from versions.models import Version
from .models import (
Expand All @@ -25,10 +19,10 @@
Issue,
Library,
LibraryVersion,
WordcloudMergeWord,
)
from libraries.constants import SUB_LIBRARIES
from mailing_list.models import EmailData
from .utils import batched


class LibraryForm(ModelForm):
Expand Down Expand Up @@ -448,72 +442,9 @@ def _get_top_contributors_for_library_version(self, library_order):
)
return top_contributors_release

def _get_mail_content(self, version):
    """Yield the content of each HyperKitty email sent before ``version``'s release.

    The window starts at the release date of the most recently released
    entry of ``Version.objects.minor_versions()`` whose ``version_array`` is
    lower than ``version``'s (inclusive) and ends at ``version.release_date``
    (exclusive).

    Yields nothing when no such prior version exists or when
    ``settings.HYPERKITTY_DATABASE_NAME`` is falsy.
    """
    prior_version = (
        Version.objects.minor_versions()
        .filter(version_array__lt=version.cleaned_version_parts_int)
        .order_by("-release_date")
        .first()
    )
    if not prior_version or not settings.HYPERKITTY_DATABASE_NAME:
        # This function is a generator, so a bare return simply ends iteration.
        return
    conn = psycopg2.connect(settings.HYPERKITTY_DATABASE_URL)
    try:
        # A named cursor is a server-side cursor in psycopg2, so rows are
        # streamed instead of being loaded into memory all at once.
        with conn.cursor(name="fetch-mail-content") as cursor:
            cursor.execute(
                """
                SELECT content FROM hyperkitty_email
                WHERE date >= %(start)s AND date < %(end)s;
                """,
                {"start": prior_version.release_date, "end": version.release_date},
            )
            for [content] in cursor:
                yield content
    finally:
        # Runs on exhaustion, on error, and when the generator is closed
        # early, so the connection is never left open.
        conn.close()

def _generate_hyperkitty_word_cloud(self, version):
    """Generate the mailing-list wordcloud for ``version``.

    Thin wrapper kept for existing callers; the rendering logic lives in
    ``ReportVisualization.generate_wordcloud``.

    Returns:
        Whatever ``ReportVisualization.generate_wordcloud(version)`` returns:
        a ``(base64_png_string, word_frequencies)`` tuple, or ``(None, {})``
        when no words were collected.
    """
    return ReportVisualization.generate_wordcloud(version)

def _count_mailinglist_contributors(self, version):
version_lt = list(
Expand Down
20 changes: 1 addition & 19 deletions libraries/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from itertools import islice
import random
import string
import re
from itertools import islice

import structlog
import tempfile
Expand Down Expand Up @@ -296,21 +296,3 @@ def parse_line(line: str):
"Some library versions were skipped during artifact parsing.",
skipped_library_versions=skipped_library_versions,
)


def boost_normalize_words(frequencies, word_map):
    """Fold the counts of mapped words into their target words.

    Args:
        frequencies: dict of ``word -> count``. It is modified in place.
        word_map: dict of ``from_word -> to_word``.

    Returns:
        The same ``frequencies`` dict, with each present, non-zero
        ``from_word`` removed and its count added to ``to_word``.
    """
    for from_word, to_word in word_map.items():
        if from_word == to_word:
            # An identity mapping would double the count and then delete
            # the word entirely; leave the entry untouched instead.
            continue
        from_count = frequencies.get(from_word, 0)
        if not from_count:
            continue
        frequencies[to_word] = frequencies.get(to_word, 0) + from_count
        del frequencies[from_word]
    return frequencies


def grey_color_func(
    word, font_size, position, orientation, random_state=None, **kwargs
):
    """Return a random grey ``hsl(...)`` colour string for a wordcloud word.

    Lightness is an integer drawn from 10..80 inclusive. When ``random_state``
    is supplied (any object with a ``randint(a, b)`` method, such as
    ``random.Random``), it is used as the source of randomness so that the
    caller's seed yields reproducible colours; otherwise the module-level
    ``random`` generator is used. The remaining arguments are accepted for
    signature compatibility and ignored.
    """
    rng = random if random_state is None else random_state
    return "hsl(0, 0%%, %d%%)" % rng.randint(10, 80)
Empty file added reports/__init__.py
Empty file.
32 changes: 32 additions & 0 deletions reports/admin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from django.contrib import admin

from reports.models import WebsiteStatReport, WebsiteStatItem


class StatInline(admin.StackedInline):
    """Stacked inline for ``WebsiteStatItem`` rows.

    Only ``value`` is shown and it is read-only; no blank extra forms are
    rendered and rows cannot be deleted from the inline.
    """

    model = WebsiteStatItem
    fields = ("value",)
    readonly_fields = ("value",)
    extra = 0
    can_delete = False


@admin.register(WebsiteStatReport)
class WebsiteStatReportAdmin(admin.ModelAdmin):
    """Admin for ``WebsiteStatReport`` with headline stats in the changelist."""

    inlines = (StatInline,)
    list_display = ("version", "pageviews", "unique_visitors", "period")
    ordering = ("-version",)

    def get_queryset(self, request):
        """Prefetch ``stats`` so the stat columns do not query once per row."""
        return super().get_queryset(request).prefetch_related("stats")

    @staticmethod
    def _formatted_stat(obj, code_name):
        """Return the stat with ``code_name`` as a thousands-separated integer.

        Returns ``None`` when the report has no such stat, which the admin
        renders as its empty-value placeholder instead of raising
        ``DoesNotExist`` and breaking the whole changelist.
        """
        # Iterate ``.all()`` rather than calling ``.get()`` so the rows
        # prefetched in ``get_queryset`` are reused.
        for stat in obj.stats.all():
            if stat.code_name == code_name:
                return f"{int(stat.value):,}"
        return None

    def pageviews(self, obj):
        """Changelist column: the report's ``pageviews`` stat."""
        return self._formatted_stat(obj, "pageviews")

    def unique_visitors(self, obj):
        """Changelist column: the report's ``visitors`` stat."""
        return self._formatted_stat(obj, "visitors")


@admin.register(WebsiteStatItem)
class WebsiteStatItemAdmin(admin.ModelAdmin):
    """Registers ``WebsiteStatItem`` with the stock ``ModelAdmin`` behaviour."""
6 changes: 6 additions & 0 deletions reports/apps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from django.apps import AppConfig


class ReportsConfig(AppConfig):
    """Application configuration for the ``reports`` Django app."""

    name = "reports"
    default_auto_field = "django.db.models.BigAutoField"
103 changes: 103 additions & 0 deletions reports/generation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import base64
import io
import random

import psycopg2
from django.conf import settings
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS

from core.models import SiteSettings
from libraries.models import WordcloudMergeWord # TODO: move model to this app
from versions.models import Version


class ReportVisualization:
    """Namespace for helpers that render report visualisations."""

    @staticmethod
    def generate_wordcloud(version: Version) -> tuple[str | None, dict]:
        """Render a wordcloud PNG from the mail content for ``version``.

        Words shorter than two characters are dropped, the remaining words
        are lower-cased and counted, and the counts are then merged
        according to the ``WordcloudMergeWord`` table.

        Returns:
            Tuple of ``(base64_encoded_png_string, word_frequencies_dict)``,
            or ``(None, {})`` when no words were collected.
        """
        wc = WordCloud(
            mode="RGBA",
            background_color=None,
            width=1400,
            height=700,
            stopwords=STOPWORDS | SiteSettings.load().wordcloud_ignore_set,
            font_path=settings.BASE_DIR / "static" / "font" / "notosans_mono.woff",
        )
        word_frequencies = {}
        for content in get_mail_content(version):
            for key, val in wc.process_text(content).items():
                if len(key) < 2:
                    continue
                key_lower = key.lower()
                word_frequencies[key_lower] = word_frequencies.get(key_lower, 0) + val
        if not word_frequencies:
            return None, {}

        word_frequencies = boost_normalize_words(
            word_frequencies,
            {x.from_word: x.to_word for x in WordcloudMergeWord.objects.all()},
        )

        wc.generate_from_frequencies(word_frequencies)
        fig = plt.figure(figsize=(14, 7), facecolor=None)
        try:
            plt.imshow(
                wc.recolor(color_func=grey_color_func, random_state=3),
                interpolation="bilinear",
            )
            plt.axis("off")
            image_bytes = io.BytesIO()
            plt.savefig(
                image_bytes,
                format="png",
                dpi=100,
                bbox_inches="tight",
                pad_inches=0,
                transparent=True,
            )
        finally:
            # pyplot keeps every figure it creates alive until it is closed;
            # without this each call would leave one more figure in memory.
            plt.close(fig)
        return base64.b64encode(image_bytes.getvalue()).decode(), word_frequencies


def boost_normalize_words(frequencies, word_map):
    """Fold the counts of mapped words into their target words.

    Args:
        frequencies: dict of ``word -> count``. It is modified in place.
        word_map: dict of ``from_word -> to_word``.

    Returns:
        The same ``frequencies`` dict, with each present, non-zero
        ``from_word`` removed and its count added to ``to_word``.
    """
    for from_word, to_word in word_map.items():
        if from_word == to_word:
            # An identity mapping would double the count and then delete
            # the word entirely; leave the entry untouched instead.
            continue
        from_count = frequencies.get(from_word, 0)
        if not from_count:
            continue
        frequencies[to_word] = frequencies.get(to_word, 0) + from_count
        del frequencies[from_word]
    return frequencies


def grey_color_func(*args, **kwargs):
    """Return a random grey ``hsl(...)`` colour string for a wordcloud word.

    Lightness is an integer drawn from 10..80 inclusive. When a
    ``random_state`` keyword is supplied (any object with a
    ``randint(a, b)`` method, such as ``random.Random``), it is used as the
    source of randomness so that the caller's seed yields reproducible
    colours; otherwise the module-level ``random`` generator is used. All
    other arguments are accepted and ignored.
    """
    random_state = kwargs.get("random_state")
    rng = random if random_state is None else random_state
    return "hsl(0, 0%%, %d%%)" % rng.randint(10, 80)


def get_mail_content(version: Version):
    """Yield the content of each HyperKitty email sent before ``version``'s release.

    The window starts at the release date of the most recently released
    entry of ``Version.objects.minor_versions()`` whose ``version_array`` is
    lower than ``version``'s (inclusive) and ends at ``version.release_date``
    (exclusive).

    Yields nothing when no such prior version exists or when
    ``settings.HYPERKITTY_DATABASE_NAME`` is falsy.
    """
    prior_version = (
        Version.objects.minor_versions()
        .filter(version_array__lt=version.cleaned_version_parts_int)
        .order_by("-release_date")
        .first()
    )
    if not prior_version or not settings.HYPERKITTY_DATABASE_NAME:
        # This function is a generator, so a bare return simply ends iteration.
        return
    conn = psycopg2.connect(settings.HYPERKITTY_DATABASE_URL)
    try:
        # A named cursor is a server-side cursor in psycopg2, so rows are
        # streamed instead of being loaded into memory all at once.
        with conn.cursor(name="fetch-mail-content") as cursor:
            cursor.execute(
                """
                SELECT content FROM hyperkitty_email
                WHERE date >= %(start)s AND date < %(end)s;
                """,
                {"start": prior_version.release_date, "end": version.release_date},
            )
            for [content] in cursor:
                yield content
    finally:
        # Runs on exhaustion, on error, and when the generator is closed
        # early, so the connection is never left open.
        conn.close()
99 changes: 99 additions & 0 deletions reports/migrations/0001_initial.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# Generated by Django 4.2.18 on 2025-02-26 19:02

import django.contrib.postgres.fields.ranges
from django.db import migrations, models
import django.db.models.deletion
import django_extensions.db.fields


class Migration(migrations.Migration):
    # Initial schema for the ``reports`` app: creates ``WebsiteStatReport``
    # and ``WebsiteStatItem`` and adds a uniqueness constraint on the items.

    initial = True

    # ``WebsiteStatReport.version`` points at ``versions.version``, so the
    # ``versions`` app must be migrated to this point first.
    dependencies = [
        ("versions", "0018_version_financial_committee_members"),
    ]

    operations = [
        # One report per version: ``version`` is a OneToOneField and the
        # report is deleted together with its version (CASCADE).
        migrations.CreateModel(
            name="WebsiteStatReport",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                (
                    "created",
                    django_extensions.db.fields.CreationDateTimeField(
                        auto_now_add=True, verbose_name="created"
                    ),
                ),
                (
                    "modified",
                    django_extensions.db.fields.ModificationDateTimeField(
                        auto_now=True, verbose_name="modified"
                    ),
                ),
                # Date range the report covers (PostgreSQL range field).
                ("period", django.contrib.postgres.fields.ranges.DateRangeField()),
                (
                    "version",
                    models.OneToOneField(
                        on_delete=django.db.models.deletion.CASCADE,
                        to="versions.version",
                    ),
                ),
            ],
            options={
                "get_latest_by": "modified",
                "abstract": False,
            },
        ),
        # A single named numeric value belonging to a report; reachable from
        # the report through the ``stats`` reverse accessor.
        migrations.CreateModel(
            name="WebsiteStatItem",
            fields=[
                (
                    "id",
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                (
                    "created",
                    django_extensions.db.fields.CreationDateTimeField(
                        auto_now_add=True, verbose_name="created"
                    ),
                ),
                (
                    "modified",
                    django_extensions.db.fields.ModificationDateTimeField(
                        auto_now=True, verbose_name="modified"
                    ),
                ),
                # Both CharFields are declared without ``max_length``.
                ("name", models.CharField()),
                ("code_name", models.CharField()),
                ("value", models.FloatField()),
                (
                    "report",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="stats",
                        to="reports.websitestatreport",
                    ),
                ),
            ],
        ),
        # At most one item per ``code_name`` within a given report.
        migrations.AddConstraint(
            model_name="websitestatitem",
            constraint=models.UniqueConstraint(
                fields=("report", "code_name"), name="unique_report_code_name"
            ),
        ),
    ]
Empty file added reports/migrations/__init__.py
Empty file.
Loading