Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Website traffic stats gathering #1679

Draft
wants to merge 4 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions config/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@
"libraries",
"mailing_list",
"news",
"reports",
"core",
"slack",
]
Expand Down
93 changes: 4 additions & 89 deletions libraries/forms.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,16 @@
import io
import base64
from functools import cached_property
from itertools import groupby, chain
from operator import attrgetter
from dataclasses import dataclass, field
from datetime import date, timedelta

import psycopg2
from wordcloud import WordCloud, STOPWORDS
from matplotlib import pyplot as plt

from django.template.loader import render_to_string
from django.db.models import F, Q, Count, OuterRef, Sum, When, Value, Case
from django.forms import Form, ModelChoiceField, ModelForm, BooleanField
from django.conf import settings

from core.models import RenderedContent, SiteSettings
from libraries.utils import batched, boost_normalize_words, grey_color_func
from core.models import RenderedContent
from reports.generation import generate_wordcloud
from slack.models import Channel, SlackActivityBucket, SlackUser
from versions.models import Version
from .models import (
Expand All @@ -25,10 +19,10 @@
Issue,
Library,
LibraryVersion,
WordcloudMergeWord,
)
from libraries.constants import SUB_LIBRARIES
from mailing_list.models import EmailData
from .utils import batched


class LibraryForm(ModelForm):
Expand Down Expand Up @@ -448,73 +442,6 @@ def _get_top_contributors_for_library_version(self, library_order):
)
return top_contributors_release

def _get_mail_content(self, version):
    """Yield the content of each HyperKitty email sent during a release cycle.

    The window runs from the release date of the most recently released
    minor version whose ``version_array`` is lower than ``version``'s
    (inclusive) up to ``version.release_date`` (exclusive).

    Yields nothing when there is no such prior version or when
    ``settings.HYPERKITTY_DATABASE_NAME`` is not set.
    """
    prior_version = (
        Version.objects.minor_versions()
        .filter(version_array__lt=version.cleaned_version_parts_int)
        .order_by("-release_date")
        .first()
    )
    if not prior_version or not settings.HYPERKITTY_DATABASE_NAME:
        # Bare return: this is a generator, so a returned value would never
        # reach callers that simply iterate over it.
        return
    conn = psycopg2.connect(settings.HYPERKITTY_DATABASE_URL)
    try:
        # A named cursor is server-side in psycopg2, so rows are streamed
        # rather than loaded into memory all at once.
        with conn.cursor(name="fetch-mail-content") as cursor:
            cursor.execute(
                """
                    SELECT content FROM hyperkitty_email
                    WHERE date >= %(start)s AND date < %(end)s;
                """,
                {"start": prior_version.release_date, "end": version.release_date},
            )
            for [content] in cursor:
                yield content
    finally:
        # The cursor context manager only closes the cursor; the connection
        # has to be closed explicitly or it leaks on every call.
        conn.close()

def _generate_hyperkitty_word_cloud(self, version):
    """Generate a wordcloud png for ``version``'s mailing-list traffic.

    Word counts are accumulated (lower-cased, words shorter than two
    characters dropped) from every mail yielded by
    ``self._get_mail_content(version)`` and merged according to the
    ``WordcloudMergeWord`` table before rendering.

    Returns:
        Tuple of (base64_encoded_png_string, word_frequencies), or
        ``(None, {})`` when no words were collected.
    """
    wc = WordCloud(
        mode="RGBA",
        background_color=None,
        width=1400,
        height=700,
        stopwords=STOPWORDS | SiteSettings.load().wordcloud_ignore_set,
        font_path=settings.BASE_DIR / "static" / "font" / "notosans_mono.woff",
    )
    word_frequencies = {}
    for content in self._get_mail_content(version):
        for key, val in wc.process_text(content).items():
            if len(key) < 2:
                continue
            key_lower = key.lower()
            word_frequencies[key_lower] = word_frequencies.get(key_lower, 0) + val
    if not word_frequencies:
        return None, {}

    word_frequencies = boost_normalize_words(
        word_frequencies,
        {x.from_word: x.to_word for x in WordcloudMergeWord.objects.all()},
    )

    wc.generate_from_frequencies(word_frequencies)
    figure = plt.figure(figsize=(14, 7), facecolor=None)
    try:
        plt.imshow(
            wc.recolor(color_func=grey_color_func, random_state=3),
            interpolation="bilinear",
        )
        plt.axis("off")
        image_bytes = io.BytesIO()
        plt.savefig(
            image_bytes,
            format="png",
            dpi=100,
            bbox_inches="tight",
            pad_inches=0,
            transparent=True,
        )
    finally:
        # pyplot keeps every figure alive until it is closed explicitly;
        # without this each call would leak a figure.
        plt.close(figure)
    return base64.b64encode(image_bytes.getvalue()).decode(), word_frequencies

def _count_mailinglist_contributors(self, version):
version_lt = list(
Version.objects.minor_versions()
Expand Down Expand Up @@ -838,19 +765,6 @@ def get_stats(self):
Channel.objects.filter(name__istartswith="boost").order_by("name"), 10
)
committee_members = version.financial_committee_members.all()
wordcloud_base64, word_frequencies = self._generate_hyperkitty_word_cloud(
version
)
# first sort by number, then sort the top 200 alphabetically
word_frequencies = {
key: val
for key, val in sorted(
word_frequencies.items(),
key=lambda x: x[1],
reverse=True,
)
}
wordcloud_top_words = sorted(list(word_frequencies.keys())[:200])
library_index_library_data = []
for library in self._get_libraries_by_quality():
library_index_library_data.append(
Expand All @@ -859,6 +773,7 @@ def get_stats(self):
library in [lib["library"] for lib in library_data],
)
)
wordcloud_base64, wordcloud_top_words = generate_wordcloud(version)

return {
"committee_members": committee_members,
Expand Down
20 changes: 1 addition & 19 deletions libraries/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from itertools import islice
import random
import string
import re
from itertools import islice

import structlog
import tempfile
Expand Down Expand Up @@ -296,21 +296,3 @@ def parse_line(line: str):
"Some library versions were skipped during artifact parsing.",
skipped_library_versions=skipped_library_versions,
)


def boost_normalize_words(frequencies, word_map):
    """Merge word counts in place according to ``word_map``.

    Args:
        frequencies: dict mapping a word to its count; mutated in place.
        word_map: dict mapping a "from" word to the "to" word that should
            absorb its count.

    Returns:
        The same ``frequencies`` dict, with each mapped "from" word removed
        and its count added to the corresponding "to" word.
    """
    for from_word, to_word in word_map.items():
        from_count = frequencies.get(from_word, 0)
        if not from_count:
            continue
        # Remove the source entry *before* adding to the target so that a
        # word mapped to itself keeps its count instead of being deleted.
        del frequencies[from_word]
        frequencies[to_word] = frequencies.get(to_word, 0) + from_count
    return frequencies


def grey_color_func(
    word, font_size, position, orientation, random_state=None, **kwargs
):
    """Return a random grey ``hsl(...)`` colour string for a wordcloud word.

    The lightness is drawn from 10-80 (inclusive). When ``random_state``
    offers a ``randint`` method (e.g. a ``random.Random`` instance) it is
    used as the source of randomness so a seeded caller gets reproducible
    colours; otherwise the module-level ``random`` generator is used.
    """
    rng = random_state if hasattr(random_state, "randint") else random
    return "hsl(0, 0%%, %d%%)" % rng.randint(10, 80)
Empty file added reports/__init__.py
Empty file.
32 changes: 32 additions & 0 deletions reports/admin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from django.contrib import admin

from reports.models import WebsiteStatReport, WebsiteStatItem


class StatInline(admin.StackedInline):
    """Stacked inline that shows each stat item's ``value`` read-only.

    No blank extra forms are rendered and existing rows cannot be deleted
    from the inline.
    """

    model = WebsiteStatItem
    fields = ("value",)
    readonly_fields = ("value",)
    extra = 0
    can_delete = False


@admin.register(WebsiteStatReport)
class WebsiteStatReportAdmin(admin.ModelAdmin):
    """Admin for website stat reports, listing headline traffic figures."""

    inlines = (StatInline,)
    list_display = ("version", "pageviews", "unique_visitors", "period")
    ordering = ("-version",)

    # TODO(review): every stat column issues its own query per row. If that
    # becomes a problem, prefetch "stats" in get_queryset() and pick the
    # items out in Python instead of filtering on the related manager.

    @staticmethod
    def _formatted_stat(obj, code_name):
        """Return the stat ``code_name`` of ``obj`` as a comma-grouped integer.

        Returns ``None`` when the report has no such stat, so the changelist
        shows the admin's empty-value placeholder instead of raising.
        """
        stat = obj.stats.filter(code_name=code_name).first()
        if stat is None:
            return None
        return f"{int(stat.value):,}"

    def pageviews(self, obj):
        """Changelist column: the report's ``pageviews`` stat."""
        return self._formatted_stat(obj, "pageviews")

    def unique_visitors(self, obj):
        """Changelist column: the report's ``visitors`` stat."""
        return self._formatted_stat(obj, "visitors")


@admin.register(WebsiteStatItem)
class WebsiteStatItemAdmin(admin.ModelAdmin):
    """Registers ``WebsiteStatItem`` in the admin with default options."""
6 changes: 6 additions & 0 deletions reports/apps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from django.apps import AppConfig


class ReportsConfig(AppConfig):
    """Django application configuration for the ``reports`` app."""

    name = "reports"
    default_auto_field = "django.db.models.BigAutoField"
6 changes: 6 additions & 0 deletions reports/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# File name of the font used to render the wordcloud.
WORDCLOUD_FONT = "notosans_mono.woff"

# Domain whose traffic statistics are requested from the analytics API.
WEB_ANALYTICS_DOMAIN = "preview.boost.org"

# URL template for the top-stats endpoint. The two positional ``{}`` fields
# are filled via ``str.format`` with the start and end dates of the period.
WEB_ANALYTICS_API_URL = (
    "https://plausible.io/api/stats/" + WEB_ANALYTICS_DOMAIN + "/top-stats/"
    "?period=custom&from={:%Y-%m-%d}&to={:%Y-%m-%d}"
)
112 changes: 112 additions & 0 deletions reports/generation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import base64
import io
import random

import psycopg2
from django.conf import settings
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS

from core.models import SiteSettings
from libraries.models import WordcloudMergeWord # TODO: move model to this app
from reports.constants import WORDCLOUD_FONT
from versions.models import Version


def generate_wordcloud(version: Version) -> tuple[str | None, list]:
    """Generate a wordcloud png of the mailing-list traffic for ``version``.

    Word counts are accumulated (lower-cased, words shorter than two
    characters dropped) from every mail yielded by
    ``get_mail_content(version)`` and merged according to the
    ``WordcloudMergeWord`` table before rendering.

    Returns:
        Tuple of (base64_encoded_png_string, wordcloud_top_words), where
        ``wordcloud_top_words`` holds up to 200 of the most frequent words,
        sorted alphabetically. ``(None, [])`` is returned when no words
        were collected.
    """
    wc = WordCloud(
        mode="RGBA",
        background_color=None,
        width=1400,
        height=700,
        stopwords=STOPWORDS | SiteSettings.load().wordcloud_ignore_set,
        font_path=settings.STATIC_ROOT / "font" / WORDCLOUD_FONT,
    )
    word_frequencies = {}
    for content in get_mail_content(version):
        for key, val in wc.process_text(content).items():
            if len(key) < 2:
                continue
            key_lower = key.lower()
            word_frequencies[key_lower] = word_frequencies.get(key_lower, 0) + val
    if not word_frequencies:
        return None, []

    word_frequencies = boost_normalize_words(
        word_frequencies,
        {x.from_word: x.to_word for x in WordcloudMergeWord.objects.all()},
    )
    # first sort by number, then sort the top 200 alphabetically
    word_frequencies = dict(
        sorted(word_frequencies.items(), key=lambda item: item[1], reverse=True)
    )
    wordcloud_top_words = sorted(list(word_frequencies)[:200])

    wc.generate_from_frequencies(word_frequencies)
    figure = plt.figure(figsize=(14, 7), facecolor=None)
    try:
        plt.imshow(
            wc.recolor(color_func=grey_color_func, random_state=3),
            interpolation="bilinear",
        )
        plt.axis("off")
        image_bytes = io.BytesIO()
        plt.savefig(
            image_bytes,
            format="png",
            dpi=100,
            bbox_inches="tight",
            pad_inches=0,
            transparent=True,
        )
    finally:
        # pyplot keeps every figure alive until it is closed explicitly;
        # without this each call would leak a figure.
        plt.close(figure)
    return base64.b64encode(image_bytes.getvalue()).decode(), wordcloud_top_words


def boost_normalize_words(frequencies, word_map):
    """Merge word counts in place according to ``word_map``.

    Args:
        frequencies: dict mapping a word to its count; mutated in place.
        word_map: dict mapping a "from" word to the "to" word that should
            absorb its count.

    Returns:
        The same ``frequencies`` dict, with each mapped "from" word removed
        and its count added to the corresponding "to" word.
    """
    for from_word, to_word in word_map.items():
        from_count = frequencies.get(from_word, 0)
        if not from_count:
            continue
        # Remove the source entry *before* adding to the target so that a
        # word mapped to itself keeps its count instead of being deleted.
        del frequencies[from_word]
        frequencies[to_word] = frequencies.get(to_word, 0) + from_count
    return frequencies


def grey_color_func(*args, **kwargs):
    """Return a random grey ``hsl(...)`` colour string for a wordcloud word.

    The lightness is drawn from 10-80 (inclusive). When a ``random_state``
    keyword argument offering a ``randint`` method (e.g. a
    ``random.Random`` instance) is supplied, it is used as the source of
    randomness so a seeded caller gets reproducible colours; otherwise the
    module-level ``random`` generator is used. All other arguments are
    ignored.
    """
    rng = kwargs.get("random_state")
    if not hasattr(rng, "randint"):
        rng = random
    return "hsl(0, 0%%, %d%%)" % rng.randint(10, 80)


def get_mail_content(version: Version):
    """Yield the content of each HyperKitty email sent during a release cycle.

    The window runs from the release date of the most recently released
    minor version whose ``version_array`` is lower than ``version``'s
    (inclusive) up to ``version.release_date`` (exclusive).

    Yields nothing when there is no such prior version or when
    ``settings.HYPERKITTY_DATABASE_NAME`` is not set.
    """
    prior_version = (
        Version.objects.minor_versions()
        .filter(version_array__lt=version.cleaned_version_parts_int)
        .order_by("-release_date")
        .first()
    )
    if not prior_version or not settings.HYPERKITTY_DATABASE_NAME:
        # Bare return: this is a generator, so a returned value would never
        # reach callers that simply iterate over it.
        return
    conn = psycopg2.connect(settings.HYPERKITTY_DATABASE_URL)
    try:
        # A named cursor is server-side in psycopg2, so rows are streamed
        # rather than loaded into memory all at once.
        with conn.cursor(name="fetch-mail-content") as cursor:
            cursor.execute(
                """
                    SELECT content FROM hyperkitty_email
                    WHERE date >= %(start)s AND date < %(end)s;
                """,
                {"start": prior_version.release_date, "end": version.release_date},
            )
            for [content] in cursor:
                yield content
    finally:
        # The cursor context manager only closes the cursor; the connection
        # has to be closed explicitly or it leaks on every call.
        conn.close()
Loading