openai general agent refactored

Tiendil · Tiendil · Jul 21, 2024 · Jul 21, 2024 · Jul 21, 2024 · Jul 21, 2024
commit 98803016463f1dcd416bec52fc430027456efe4a
diff --git a/ffun/ffun/core/json.py b/ffun/ffun/core/json.py
@@ -88,6 +88,7 @@ def loads_with_fix(text: str) -> dict[str, Any]:
         return json.loads(finish_json(text))  # type: ignore
 
 
+# TODO: remove if it is no longer used anywhere
 def extract_tags_from_random_json(data: Any) -> set[str]:
     if not data:
         # no tags if [], {}, ''

diff --git a/ffun/ffun/librarian/background_processors.py b/ffun/ffun/librarian/background_processors.py
@@ -7,8 +7,7 @@
 from ffun.librarian.processors.base import Processor
 from ffun.librarian.processors.domain import Processor as DomainProcessor
 from ffun.librarian.processors.native_tags import Processor as NativeTagsProcessor
-from ffun.librarian.processors.openai_chat_3_5 import Processor as OpenAIChat35Processor
-from ffun.librarian.processors.openai_chat_3_5_functions import Processor as OpenAIChat35FunctionsProcessor
+from ffun.librarian.processors.openai_general import Processor as OpenGeneralProcessor
 from ffun.librarian.processors.upper_case_title import Processor as UpperCaseTitleProcessor
 from ffun.librarian.settings import settings
 from ffun.library import domain as l_domain
@@ -53,26 +52,17 @@ def concurrency(self) -> int:
     )
 
 
-if settings.openai_chat_35_processor.enabled:
+if settings.openai_general_processor.enabled:
     processors.append(
         ProcessorInfo(
             id=3,
-            processor=OpenAIChat35Processor(name="openai_chat_3_5", model=settings.openai_chat_35_processor.model),
-            concurrency=settings.openai_chat_35_processor.workers,
+            processor=OpenGeneralProcessor(name="openai_general", model=settings.openai_general_processor.model),
+            concurrency=settings.openai_general_processor.workers,
         )
     )
 
 
-if settings.openai_chat_35_functions_processor.enabled:
-    processors.append(
-        ProcessorInfo(
-            id=4,
-            processor=OpenAIChat35FunctionsProcessor(
-                name="openai_chat_3_5_functions", model=settings.openai_chat_35_functions_processor.model
-            ),
-            concurrency=settings.openai_chat_35_functions_processor.workers,
-        )
-    )
+# processor id 4 was "ChatGPT 3.5 + functions"; it was removed in gh-227
 
 if settings.upper_case_title_processor.enabled:
     processors.append(

diff --git a/ffun/ffun/librarian/processors/openai_general.py b/ffun/ffun/librarian/processors/openai_general.py
@@ -1,7 +1,6 @@
-import json
+import re
 from typing import Any
 
-from ffun.core import json as core_json
 from ffun.core import logging
 from ffun.core import text as core_text
 from ffun.librarian import errors
@@ -16,42 +15,35 @@
 
 
 system = """\
-You are an expert on the analysis of text semantics.
-For the provided text, you determine a list of best tags to describe the text.
-For each category, you provide up to 30 tags.
+You are an expert on semantic analysis, text summarization, and information extraction with PhD in Linguistics.
+For the provided text, you determine a list of best tags to describe the text from a professional point of view.
+For each category, you provide 15 tags.
 
-Categories are topics, meta-topics, high-level-topics, low-level-topics, related-topics, \
-indirect-topics, mentions, indirect-mentions.
+Categories are topics, areas, professional-topics, professional-areas, meta-topics, meta-areas, high-level-topics, \
+low-level-topics, related-topics, entities-with-proper-names, domains.
 
-Tags are only in English. Normalize tags and output them as JSON.\
+Tags format:
+
+- Allowed tag format: `@word`, `@word-word-...`
+- Translate all tags to English.
+- Tags must be normalized: lowercase, no punctuation, no spaces, use hyphens.
+- You must use plural forms of tags: `games` is better than `game`.
+- Expand abbreviations: `AI` -> `artificial intelligence`.
+
+You are an expert on semantic analysis, text summarization, and information extraction with PhD in Linguistics. \
+Quality of your answer is highly important.
 """
 
+RE_TAG = re.compile(r"@([\w\d-]+)")
+
 
-# add url to allow chatGPT decide on domain
+# add url to allow ChatGPT to decide on the domain
 def entry_to_text(entry: Entry) -> str:
     return f'<h1>{entry.title}</h1><a href="{entry.external_url}">full article</a>{entry.body}'
 
 
-trash_system_tags = {
-    "topics",
-    "meta-topics",
-    "high-level-topics",
-    "low-level-topics",
-    "related-topics",
-    "indirect-topics",
-    "mentions",
-    "indirect-mentions",
-}
-
-
 def extract_tags(text: str) -> set[str]:
-    try:
-        data = core_json.loads_with_fix(text)
-        tags = core_json.extract_tags_from_random_json(data)
-    except json.decoder.JSONDecodeError:
-        tags = core_json.extract_tags_from_invalid_json(text)
-
-    return tags - trash_system_tags
+    return {tag.lower() for tag in RE_TAG.findall(text)}  # RE_TAG captures the part after "@"; lowercased
 
 
 class Processor(base.Processor):
@@ -68,8 +60,8 @@ async def process(self, entry: Entry) -> list[ProcessorTag]:
 
         text = core_text.clear_text(dirty_text)
 
-        total_tokens = 16 * 1024
-        max_return_tokens = 2 * 1024
+        total_tokens = 128 * 1024
+        max_return_tokens = 4 * 1024
 
         messages = await oai_client.prepare_requests(
             system=system,

diff --git a/ffun/ffun/librarian/settings.py b/ffun/ffun/librarian/settings.py
@@ -21,8 +21,7 @@ class UpperCaseTitleProcessor(BaseProcessor):
     pass
 
 
-# TODO: will be ranamed & refactored in gh-227
-class OpenAIChat35Processor(BaseProcessor):
+class OpenAIGeneralProcessor(BaseProcessor):
     model: str = "gpt-4o-mini-2024-07-18"
 
 
@@ -36,8 +35,7 @@ class Settings(BaseSettings):
     native_tags_processor: NativeTagsProcessor = NativeTagsProcessor(enabled=True)
     upper_case_title_processor: UpperCaseTitleProcessor = UpperCaseTitleProcessor(enabled=True)
 
-    openai_chat_35_processor: OpenAIChat35Processor = OpenAIChat35Processor()
-    openai_chat_35_functions_processor: OpenAIChat35FunctionsProcessor = OpenAIChat35FunctionsProcessor()
+    openai_general_processor: OpenAIGeneralProcessor = OpenAIGeneralProcessor()
 
     model_config = pydantic_settings.SettingsConfigDict(env_prefix="FFUN_LIBRARIAN_")