Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gh-227 refactor llm prompt #230

Merged
merged 9 commits into from
Jul 21, 2024
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
openai general agent refactored
Tiendil committed Jul 21, 2024
commit 98803016463f1dcd416bec52fc430027456efe4a
1 change: 1 addition & 0 deletions ffun/ffun/core/json.py
Original file line number Diff line number Diff line change
@@ -88,6 +88,7 @@ def loads_with_fix(text: str) -> dict[str, Any]:
return json.loads(finish_json(text)) # type: ignore


# TODO: remove if it is not used anywhere
def extract_tags_from_random_json(data: Any) -> set[str]:
if not data:
# no tags if [], {}, ''
18 changes: 4 additions & 14 deletions ffun/ffun/librarian/background_processors.py
Original file line number Diff line number Diff line change
@@ -7,8 +7,7 @@
from ffun.librarian.processors.base import Processor
from ffun.librarian.processors.domain import Processor as DomainProcessor
from ffun.librarian.processors.native_tags import Processor as NativeTagsProcessor
from ffun.librarian.processors.openai_chat_3_5 import Processor as OpenAIChat35Processor
from ffun.librarian.processors.openai_chat_3_5_functions import Processor as OpenAIChat35FunctionsProcessor
from ffun.librarian.processors.openai_general import Processor as OpenGeneralProcessor
from ffun.librarian.processors.upper_case_title import Processor as UpperCaseTitleProcessor
from ffun.librarian.settings import settings
from ffun.library import domain as l_domain
@@ -53,26 +52,17 @@ def concurrency(self) -> int:
)


if settings.openai_chat_35_processor.enabled:
if settings.openai_general_processor.enabled:
processors.append(
ProcessorInfo(
id=3,
processor=OpenAIChat35Processor(name="openai_chat_3_5", model=settings.openai_chat_35_processor.model),
processor=OpenGeneralProcessor(name="openai_general", model=settings.openai_general_processor.model),
concurrency=settings.openai_chat_35_processor.workers,
)
)


if settings.openai_chat_35_functions_processor.enabled:
processors.append(
ProcessorInfo(
id=4,
processor=OpenAIChat35FunctionsProcessor(
name="openai_chat_3_5_functions", model=settings.openai_chat_35_functions_processor.model
),
concurrency=settings.openai_chat_35_functions_processor.workers,
)
)
# the processor 4 was "ChatGPT 3.5 + functions" and was removed in gh-227

if settings.upper_case_title_processor.enabled:
processors.append(
50 changes: 21 additions & 29 deletions ffun/ffun/librarian/processors/openai_general.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import json
import re
from typing import Any

from ffun.core import json as core_json
from ffun.core import logging
from ffun.core import text as core_text
from ffun.librarian import errors
@@ -16,42 +15,35 @@


system = """\
You are an expert on the analysis of text semantics.
For the provided text, you determine a list of best tags to describe the text.
For each category, you provide up to 30 tags.
You are an expert on semantic analysis, text summarization, and information extraction with PhD in Linguistics.
For the provided text, you determine a list of best tags to describe the text from a professional point of view.
For each category, you provide 15 tags.

Categories are topics, meta-topics, high-level-topics, low-level-topics, related-topics, \
indirect-topics, mentions, indirect-mentions.
Categories are topics, areas, professional-topics, professional-areas, meta-topics, meta-areas, high-level-topics, \
low-level-topics, related-topics, entities-with-proper-names, domains.

Tags are only in English. Normalize tags and output them as JSON.\
Tags format:

- Allowed tag format: `@word`, `@word-word-...`
- Translate all tags to English.
- Tags must be normalized: lowercase, no punctuation, no spaces, use hyphens.
- You must use plural forms of tags: `games` is better than `game`.
- Expand abbreviations: `AI` -> `artificial intelligence`.

You are an expert on semantic analysis, text summarization, and information extraction with PhD in Linguistics. \
Quality of your answer is highly important.
"""

RE_TAG = re.compile(r"@([\w\d-]+)")


# add the URL to allow ChatGPT to decide on the domain
def entry_to_text(entry: Entry) -> str:
return f'<h1>{entry.title}</h1><a href="{entry.external_url}">full article</a>{entry.body}'


trash_system_tags = {
"topics",
"meta-topics",
"high-level-topics",
"low-level-topics",
"related-topics",
"indirect-topics",
"mentions",
"indirect-mentions",
}


def extract_tags(text: str) -> set[str]:
try:
data = core_json.loads_with_fix(text)
tags = core_json.extract_tags_from_random_json(data)
except json.decoder.JSONDecodeError:
tags = core_json.extract_tags_from_invalid_json(text)

return tags - trash_system_tags
return set(tag.lower() for tag in RE_TAG.findall(text))


class Processor(base.Processor):
@@ -68,8 +60,8 @@ async def process(self, entry: Entry) -> list[ProcessorTag]:

text = core_text.clear_text(dirty_text)

total_tokens = 16 * 1024
max_return_tokens = 2 * 1024
total_tokens = 128 * 1024
max_return_tokens = 4 * 1024

messages = await oai_client.prepare_requests(
system=system,
6 changes: 2 additions & 4 deletions ffun/ffun/librarian/settings.py
Original file line number Diff line number Diff line change
@@ -21,8 +21,7 @@ class UpperCaseTitleProcessor(BaseProcessor):
pass


# TODO: will be renamed & refactored in gh-227
class OpenAIChat35Processor(BaseProcessor):
class OpenAIGeneralProcessor(BaseProcessor):
model: str = "gpt-4o-mini-2024-07-18"


@@ -36,8 +35,7 @@ class Settings(BaseSettings):
native_tags_processor: NativeTagsProcessor = NativeTagsProcessor(enabled=True)
upper_case_title_processor: UpperCaseTitleProcessor = UpperCaseTitleProcessor(enabled=True)

openai_chat_35_processor: OpenAIChat35Processor = OpenAIChat35Processor()
openai_chat_35_functions_processor: OpenAIChat35FunctionsProcessor = OpenAIChat35FunctionsProcessor()
openai_general_processor: OpenAIGeneralProcessor = OpenAIGeneralProcessor()

model_config = pydantic_settings.SettingsConfigDict(env_prefix="FFUN_LIBRARIAN_")