-
Notifications
You must be signed in to change notification settings - Fork 31
Refactor KG builder #52
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
12 commits
Select commit
Hold shift + click to select a range
4188441
refactor: use async_to_sync_method
ChenZiHong-Gavin a2b9459
delete some code
ChenZiHong-Gavin 1cafc02
refactor: refact split_chunks
ChenZiHong-Gavin e18b947
Update graphgen/graphgen.py
ChenZiHong-Gavin 6d3bdbd
fix: fix type annotation
ChenZiHong-Gavin 9a56e30
wip: add base_kg_builder
ChenZiHong-Gavin 051dc77
Merge branch 'main' of https://github.com/open-sciencelab/GraphGen in…
ChenZiHong-Gavin 8fd34b2
refactor: abstract run_concurrent & delete semaphore
ChenZiHong-Gavin e42bcb6
refactor: refact llm_client & tokenizer
ChenZiHong-Gavin 9b7ef17
Update graphgen/bases/base_llm_client.py
ChenZiHong-Gavin 128d2f8
Merge branch 'main' of https://github.com/open-sciencelab/GraphGen in…
ChenZiHong-Gavin b30f5a1
wip: add NetworkXKGBuilder
ChenZiHong-Gavin File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
from .base_kg_builder import BaseKGBuilder | ||
from .base_llm_client import BaseLLMClient | ||
from .base_reader import BaseReader | ||
from .base_splitter import BaseSplitter | ||
from .base_storage import ( | ||
BaseGraphStorage, | ||
BaseKVStorage, | ||
BaseListStorage, | ||
StorageNameSpace, | ||
) | ||
from .base_tokenizer import BaseTokenizer | ||
from .datatypes import Chunk, QAPair, Token |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
from abc import ABC, abstractmethod | ||
from collections import defaultdict | ||
from dataclasses import dataclass, field | ||
from typing import Dict, List, Tuple | ||
|
||
from graphgen.bases.base_llm_client import BaseLLMClient | ||
from graphgen.bases.base_storage import BaseGraphStorage | ||
from graphgen.bases.datatypes import Chunk | ||
|
||
|
||
@dataclass
class BaseKGBuilder(ABC):
    """Abstract base class for knowledge-graph builders.

    Concrete subclasses extract node/edge records from text chunks with the
    LLM client and merge them into the graph storage backend.
    """

    # Graph storage backend the extracted knowledge is merged into.
    kg_instance: BaseGraphStorage
    # LLM client used to drive extraction.
    llm_client: BaseLLMClient

    # Accumulators for extracted records, keyed by entity name / by
    # (source, target) endpoint pair; defaultdict(list) so new keys
    # auto-initialise to an empty record list.
    _nodes: Dict[str, List[dict]] = field(default_factory=lambda: defaultdict(list))
    _edges: Dict[Tuple[str, str], List[dict]] = field(
        default_factory=lambda: defaultdict(list)
    )

    def build(self, chunks: List[Chunk]) -> None:
        # NOTE(review): intentionally a no-op in the base class — subclasses
        # are expected to override; confirm whether it should be abstract.
        pass

    @abstractmethod
    async def extract_all(self, chunks: List[Chunk]) -> None:
        """Extract nodes and edges from all chunks."""
        raise NotImplementedError

    @abstractmethod
    async def extract(
        self, chunk: Chunk
    ) -> Tuple[Dict[str, List[dict]], Dict[Tuple[str, str], List[dict]]]:
        """Extract nodes and edges from a single chunk."""
        raise NotImplementedError

    @abstractmethod
    async def merge_nodes(
        self,
        nodes_data: Dict[str, List[dict]],
        kg_instance: BaseGraphStorage,
        llm: BaseLLMClient,
    ) -> None:
        """Merge extracted nodes into the knowledge graph."""
        raise NotImplementedError
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
from __future__ import annotations | ||
|
||
import abc | ||
import re | ||
from typing import Any, List, Optional | ||
|
||
from graphgen.bases.base_tokenizer import BaseTokenizer | ||
from graphgen.bases.datatypes import Token | ||
|
||
|
||
class BaseLLMClient(abc.ABC):
    """
    Backend-agnostic base class for LLM clients (OpenAI / Ollama / ...).

    Holds the sampling configuration shared by every backend and declares the
    async generation interface concrete clients must implement.
    """

    def __init__(
        self,
        *,
        system_prompt: str = "",
        temperature: float = 0.0,
        max_tokens: int = 4096,
        repetition_penalty: float = 1.05,
        top_p: float = 0.95,
        top_k: int = 50,
        tokenizer: Optional[BaseTokenizer] = None,
        **kwargs: Any,
    ):
        # Common prompting / sampling configuration.
        self.system_prompt = system_prompt
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.repetition_penalty = repetition_penalty
        self.top_p = top_p
        self.top_k = top_k
        # Optional tokenizer; only needed by count_tokens().
        self.tokenizer = tokenizer

        # Backend-specific options (e.g. api keys, endpoints) become attributes.
        for extra_name, extra_value in kwargs.items():
            setattr(self, extra_name, extra_value)

    @abc.abstractmethod
    async def generate_answer(
        self, text: str, history: Optional[List[str]] = None, **extra: Any
    ) -> str:
        """Produce a completion for *text*, optionally conditioned on *history*."""
        raise NotImplementedError

    @abc.abstractmethod
    async def generate_topk_per_token(
        self, text: str, history: Optional[List[str]] = None, **extra: Any
    ) -> List[Token]:
        """Return the top-k candidate tokens for the next-token prediction."""
        raise NotImplementedError

    @abc.abstractmethod
    async def generate_inputs_prob(
        self, text: str, history: Optional[List[str]] = None, **extra: Any
    ) -> List[Token]:
        """Return per-token probabilities for the tokens of the input text."""
        raise NotImplementedError

    def count_tokens(self, text: str) -> int:
        """Return how many tokens *text* encodes to (requires a tokenizer)."""
        if self.tokenizer is not None:
            return len(self.tokenizer.encode(text))
        raise ValueError("Tokenizer is not set. Please provide a tokenizer to use count_tokens.")

    @staticmethod
    def filter_think_tags(text: str, think_tag: str = "think") -> str:
        """
        Drop every <think>...</think> span from *text*.

        If removing the spans leaves nothing, the original (stripped) text is
        returned so callers never receive an empty answer.
        """
        pattern = re.compile(rf"<{think_tag}>.*?</{think_tag}>", re.DOTALL)
        remainder = pattern.sub("", text).strip()
        if remainder:
            return remainder
        return text.strip()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
from __future__ import annotations | ||
|
||
from abc import ABC, abstractmethod | ||
from dataclasses import dataclass | ||
from typing import List | ||
|
||
|
||
@dataclass
class BaseTokenizer(ABC):
    """Abstract tokenizer: subclasses supply encode/decode; chunking is shared.

    Attributes are documented inline; the class is a dataclass so subclasses
    get the generated __init__ for free.
    """

    # Identifier of the underlying tokenizer model (e.g. a tiktoken encoding).
    model_name: str = "cl100k_base"

    @abstractmethod
    def encode(self, text: str) -> List[int]:
        """Encode text -> token ids."""
        raise NotImplementedError

    @abstractmethod
    def decode(self, token_ids: List[int]) -> str:
        """Decode token ids -> text."""
        raise NotImplementedError

    def count_tokens(self, text: str) -> int:
        """Return the number of tokens ``text`` encodes to."""
        return len(self.encode(text))

    def chunk_by_token_size(
        self,
        content: str,
        *,
        overlap_token_size: int = 128,
        max_token_size: int = 1024,
    ) -> List[dict]:
        """Split ``content`` into overlapping windows of at most ``max_token_size`` tokens.

        Consecutive chunks share ``overlap_token_size`` tokens. Each result dict
        carries ``tokens`` (token count of the chunk), ``content`` (decoded,
        stripped text) and ``chunk_order_index`` (0-based position).

        Returns an empty list for empty input.

        Raises:
            ValueError: if ``max_token_size <= 0`` or
                ``overlap_token_size >= max_token_size`` — previously a step of 0
                surfaced as an opaque ``range()`` error and a negative step
                silently returned no chunks, dropping the input.
        """
        if max_token_size <= 0:
            raise ValueError("max_token_size must be positive")
        if overlap_token_size >= max_token_size:
            raise ValueError(
                "overlap_token_size must be smaller than max_token_size"
            )
        tokens = self.encode(content)
        results: List[dict] = []
        # Window start advances by (max - overlap) so neighbours overlap.
        step = max_token_size - overlap_token_size
        for index, start in enumerate(range(0, len(tokens), step)):
            chunk_ids = tokens[start : start + max_token_size]
            results.append(
                {
                    "tokens": len(chunk_ids),
                    "content": self.decode(chunk_ids).strip(),
                    "chunk_order_index": index,
                }
            )
        return results
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.