
feat: Add versioning to the data point model #378

Merged · 19 commits · Jan 16, 2025
Changes from 9 commits
2 changes: 2 additions & 0 deletions .github/workflows/profiling.yaml
@@ -57,6 +57,8 @@ jobs:
run: |
poetry install --no-interaction --all-extras
poetry run pip install pyinstrument
poetry run pip install parso
poetry run pip install jedi


# Set environment variables for SHAs
3 changes: 3 additions & 0 deletions cognee/base_config.py
@@ -10,6 +10,9 @@ class BaseConfig(BaseSettings):
monitoring_tool: object = MonitoringTool.LANGFUSE
graphistry_username: Optional[str] = os.getenv("GRAPHISTRY_USERNAME")
graphistry_password: Optional[str] = os.getenv("GRAPHISTRY_PASSWORD")
langfuse_public_key: Optional[str] = os.getenv("LANGFUSE_PUBLIC_KEY")
langfuse_secret_key: Optional[str] = os.getenv("LANGFUSE_SECRET_KEY")
langfuse_host: Optional[str] = os.getenv("LANGFUSE_HOST")

model_config = SettingsConfigDict(env_file = ".env", extra = "allow")

67 changes: 57 additions & 10 deletions cognee/infrastructure/engine/models/DataPoint.py
@@ -1,27 +1,39 @@


from datetime import datetime, timezone
from typing import Optional
from typing import Optional, Any, Dict
from uuid import UUID, uuid4

from pydantic import BaseModel, Field
from typing_extensions import TypedDict
import pickle

Contributor:

🛠️ Refactor suggestion

Organize imports and consider security implications

The imports should be organized according to the standard convention (stdlib, third-party, local). Also, using pickle for serialization poses security risks as it can execute arbitrary code during deserialization.

-
-
from datetime import datetime, timezone
from typing import Optional, Any, Dict
from uuid import UUID, uuid4
+from datetime import datetime, timezone
+from typing import Optional, Any, Dict
+from uuid import UUID, uuid4
+
+import json
+import pickle  # Consider removing in favor of json
+
+from pydantic import BaseModel, Field
+from typing_extensions import TypedDict
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

🧰 Tools
🪛 GitHub Actions: ruff format

[warning] File requires formatting using Ruff formatter


# Define metadata type
class MetaData(TypedDict):
index_fields: list[str]


# Updated DataPoint model with versioning and new fields
class DataPoint(BaseModel):
__tablename__ = "data_point"
id: UUID = Field(default_factory = uuid4)
updated_at: Optional[datetime] = datetime.now(timezone.utc)
id: UUID = Field(default_factory=uuid4)
created_at: int = Field(default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000))
Contributor:

What is the difference between datetime.now(timezone.utc) and this one?

Contributor (author):

created_at is when the initial record was created; updated_at is refreshed on any subsequent change.
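A minimal sketch of how the created_at/updated_at defaults in this diff behave: both store an integer count of milliseconds since the Unix epoch. The helper name `now_ms` is an illustration, not part of the PR.

```python
from datetime import datetime, timezone

def now_ms() -> int:
    """Current UTC time as integer milliseconds since the epoch."""
    return int(datetime.now(timezone.utc).timestamp() * 1000)

ts = now_ms()
# A millisecond epoch timestamp for the present era has 13 digits.
assert len(str(ts)) == 13
```

This matches the `default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000)` expression used for both fields.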

updated_at: int = Field(default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000))
Contributor:

🛠️ Refactor suggestion

Simplify timestamp creation and add validation

The timestamp creation could be simplified and should validate against negative values.

Consider this improvement:

-    created_at: int = Field(default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000))
-    updated_at: int = Field(default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000))
+    created_at: int = Field(
+        default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000),
+        ge=0
+    )
+    updated_at: int = Field(
+        default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000),
+        ge=0
+    )

version: str = "0.1" # Default version
Contributor:

I would keep it as a number, and we can just increase it with each version. (1, 2, 3, 4...)

Contributor (author):

kk
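The agreed-upon integer versioning could be sketched as follows; the field names follow the PR, but the increment logic is an assumption about how the fix would look.

```python
from datetime import datetime, timezone
from pydantic import BaseModel, Field

class VersionedPoint(BaseModel):
    # Integer version that starts at 1 and is bumped on each change.
    version: int = 1
    updated_at: int = Field(
        default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000)
    )

    def update_version(self) -> None:
        """Bump the version counter and refresh updated_at."""
        self.version += 1
        self.updated_at = int(datetime.now(timezone.utc).timestamp() * 1000)

p = VersionedPoint()
p.update_version()
assert p.version == 2
```

An integer counter avoids the string-format validation the reviewer suggested elsewhere for "X.Y" versions.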

source: Optional[str] = None # Path to file, URL, etc.
Contributor:

source is a Document-model property; it doesn't belong in this general DataPoint model.

Contributor (author):

fair

type: Optional[str] = "text" # "text", "file", "image", "video"
Contributor:

🛠️ Refactor suggestion

Add version format validation and type enumeration

The version string and type field should have proper validation.

Consider these improvements:

+from enum import Enum
+import re
+
+class DataPointType(str, Enum):
+    TEXT = "text"
+    FILE = "file"
+    IMAGE = "image"
+    VIDEO = "video"

class DataPoint(BaseModel):
    # ... other fields ...
-    version: str = "0.1"  # Default version
-    type: Optional[str] = "text"  # "text", "file", "image", "video"
+    version: str = Field(
+        default="0.1",
+        regex=r"^\d+\.\d+$"
+    )
+    type: Optional[DataPointType] = Field(default=DataPointType.TEXT)

Committable suggestion skipped: line range outside the PR's diff.

Contributor:

Same for type, doesn't belong here.

Contributor (author):

Laszlo asked me for this one, due to retriever logic. In general I agree.

Contributor:

Yes, the only thing is that the type should be the pydantic type. Like "Entity", "TextSummary" etc.
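One way to realize this suggestion, sketched under the assumption of Pydantic v2: default `type` to the model's class name (e.g. "Entity", "TextSummary") via an after-validator. Names here are illustrative, not the PR's final code.

```python
from typing import Optional
from pydantic import BaseModel, model_validator

class DataPoint(BaseModel):
    type: Optional[str] = None

    @model_validator(mode="after")
    def _set_type(self) -> "DataPoint":
        # Fall back to the concrete Pydantic class name when unset.
        if self.type is None:
            self.type = type(self).__name__
        return self

class TextSummary(DataPoint):
    pass

assert TextSummary().type == "TextSummary"
```

Subclasses inherit the validator, so each model self-labels without a hand-maintained string.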

topological_rank: Optional[int] = 0
extra: Optional[str] = "extra" # For additional properties
Vasilije1990 marked this conversation as resolved.
_metadata: Optional[MetaData] = {
"index_fields": [],
"type": "DataPoint"
}

# class Config:
# underscore_attrs_are_private = True
# Override the Pydantic configuration
class Config:
underscore_attrs_are_private = True

@classmethod
Vasilije1990 marked this conversation as resolved.
@classmethod
Contributor:

⚠️ Potential issue

Remove duplicate @classmethod decorator

The @classmethod decorator is duplicated.

-    @classmethod
-    @classmethod
+    @classmethod

def get_embeddable_data(self, data_point):
if data_point._metadata and len(data_point._metadata["index_fields"]) > 0 \
@@ -30,16 +42,51 @@ def get_embeddable_data(self, data_point):

if isinstance(attribute, str):
return attribute.strip()
else:
return attribute
return attribute

@classmethod
def get_embeddable_properties(self, data_point):
"""Retrieve all embeddable properties."""
if data_point._metadata and len(data_point._metadata["index_fields"]) > 0:
return [getattr(data_point, field, None) for field in data_point._metadata["index_fields"]]

return []

@classmethod
def get_embeddable_property_names(self, data_point):
return data_point._metadata["index_fields"] or []
"""Retrieve names of embeddable properties."""
return data_point._metadata["index_fields"] or []

def update_version(self, new_version: str):
"""Update the version and updated_at timestamp."""
Contributor:

If we have a number as a version, we can do +1 here then.

Contributor (author):

Fixed

self.version = new_version
Contributor:

🛠️ Refactor suggestion

Improve version update method with validation

The update_version method should validate the version format and reuse timestamp logic.

Consider this improvement:

     def update_version(self, new_version: str):
         """Update the version and updated_at timestamp."""
+        if not re.match(r"^\d+\.\d+$", new_version):
+            raise ValueError("Version must be in format 'X.Y'")
         self.version = new_version
-        self.updated_at = int(datetime.now(timezone.utc).timestamp() * 1000)
+        self.updated_at = Field(
+            default_factory=lambda: int(datetime.now(timezone.utc).timestamp() * 1000)
+        ).default_factory()

Committable suggestion skipped: line range outside the PR's diff.

self.updated_at = int(datetime.now(timezone.utc).timestamp() * 1000)

# JSON Serialization
Contributor:

Why do we need this serialization?

Contributor (author):

So tasks can be parallelized, since you had issues with that; either pickle or JSON works.
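To make the parallelization rationale concrete, a sketch of the intended pattern: workers receive a plain JSON string and rebuild the model, so no live object state has to cross process boundaries. The `DataPoint` stand-in and `worker` function are illustrative; only the `to_json`/`from_json` method names follow the PR.

```python
from pydantic import BaseModel

class DataPoint(BaseModel):
    id: int
    text: str

    def to_json(self) -> str:
        """Serialize the instance to a JSON string (Pydantic v2 API)."""
        return self.model_dump_json()

    @classmethod
    def from_json(cls, json_str: str) -> "DataPoint":
        """Deserialize the instance from a JSON string."""
        return cls.model_validate_json(json_str)

def worker(payload: str) -> str:
    # What each parallel task would do with a serialized point.
    point = DataPoint.from_json(payload)
    point.text = point.text.upper()
    return point.to_json()

out = worker(DataPoint(id=1, text="hello").to_json())
assert DataPoint.from_json(out).text == "HELLO"
```

JSON strings are safe to pass through multiprocessing queues or task brokers, which is exactly where unpicklable attributes (clients, locks) otherwise cause trouble.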

def to_json(self) -> str:
"""Serialize the instance to a JSON string."""
return self.json()

@classmethod
def from_json(self, json_str: str):
"""Deserialize the instance from a JSON string."""
return self.model_validate_json(json_str)

# Pickle Serialization
def to_pickle(self) -> bytes:
"""Serialize the instance to pickle-compatible bytes."""
return pickle.dumps(self.dict())

@classmethod
def from_pickle(self, pickled_data: bytes):
"""Deserialize the instance from pickled bytes."""
data = pickle.loads(pickled_data)
Comment on lines +76 to +85
Contributor:

🛠️ Refactor suggestion

⚠️ Potential issue

Security concern: Replace pickle with a safer serialization method

Using pickle for serialization poses a security risk as it can execute arbitrary code during deserialization. Consider using a safer alternative like JSON or MessagePack.

-    def to_pickle(self) -> bytes:
-        """Serialize the instance to pickle-compatible bytes."""
-        return pickle.dumps(self.dict())
-
-    @classmethod
-    def from_pickle(self, pickled_data: bytes):
-        """Deserialize the instance from pickled bytes."""
-        data = pickle.loads(pickled_data)
-        return self(**data)
+    def to_bytes(self) -> bytes:
+        """Serialize the instance to bytes using JSON."""
+        return self.json().encode('utf-8')
+
+    @classmethod
+    def from_bytes(cls, data: bytes):
+        """Deserialize the instance from JSON bytes."""
+        return cls.parse_raw(data)
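To make the flagged risk concrete, a small illustrative demonstration (not from the PR) of why `pickle.loads` is unsafe on untrusted bytes: unpickling can invoke arbitrary callables chosen by whoever produced the payload.

```python
import pickle

class Evil:
    def __reduce__(self):
        # pickle calls this callable with these args during loads();
        # print is harmless here, but an attacker would substitute
        # os.system or similar.
        return (print, ("arbitrary code ran during unpickling",))

payload = pickle.dumps(Evil())
result = pickle.loads(payload)  # runs print(...) as a side effect
assert result is None  # print returns None
```

JSON deserialization has no equivalent hook, which is the core of the suggestion above.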

return self(**data)

def to_dict(self, **kwargs) -> Dict[str, Any]:
"""Serialize model to a dictionary."""
return self.model_dump(**kwargs)

@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "DataPoint":
"""Deserialize model from a dictionary."""
return cls.model_validate(data)
12 changes: 11 additions & 1 deletion cognee/infrastructure/llm/openai/adapter.py
@@ -6,10 +6,11 @@
import litellm
import instructor
from pydantic import BaseModel

from cognee.shared.data_models import MonitoringTool
from cognee.exceptions import InvalidValueError
from cognee.infrastructure.llm.llm_interface import LLMInterface
from cognee.infrastructure.llm.prompts import read_query_prompt
from cognee.base_config import get_base_config

class OpenAIAdapter(LLMInterface):
name = "OpenAI"
@@ -35,6 +36,15 @@ def __init__(
self.endpoint = endpoint
self.api_version = api_version
self.streaming = streaming
base_config = get_base_config()
if base_config.monitoring_tool == MonitoringTool.LANGFUSE:
# set callbacks
# litellm.success_callback = ["langfuse"]
Contributor:

Let's remove these commented lines.

Contributor (author):

Done

# litellm.failure_callback = ["langfuse"]
self.aclient.success_callback = ["langfuse"]
self.aclient.failure_callback = ["langfuse"]
self.client.success_callback = ["langfuse"]
self.client.failure_callback = ["langfuse"]
Contributor:

🛠️ Refactor suggestion

Refactor callback configuration and add error handling

The current implementation has several areas for improvement:

  1. Duplicate callback configuration for both clients
  2. Missing error handling for base_config
  3. Missing documentation for the monitoring feature

Consider refactoring like this:

+    def _configure_langfuse_callbacks(self, client):
+        """Configure Langfuse callbacks for the given client."""
+        client.success_callback = ["langfuse"]
+        client.failure_callback = ["langfuse"]

     def __init__(
         self,
         api_key: str,
         endpoint: str,
         api_version: str,
         model: str,
         transcription_model: str,
         streaming: bool = False,
     ):
+        """Initialize OpenAI adapter with optional Langfuse monitoring.
+        
+        Args:
+            api_key (str): OpenAI API key
+            endpoint (str): API endpoint
+            api_version (str): API version
+            model (str): Model identifier
+            transcription_model (str): Model for transcription
+            streaming (bool, optional): Enable streaming. Defaults to False.
+        """
         self.aclient = instructor.from_litellm(litellm.acompletion)
         self.client = instructor.from_litellm(litellm.completion)
         self.transcription_model = transcription_model
         self.model = model
         self.api_key = api_key
         self.endpoint = endpoint
         self.api_version = api_version
         self.streaming = streaming

-        base_config = get_base_config()
-        if base_config.monitoring_tool == MonitoringTool.LANGFUSE:
-            self.aclient.success_callback = ["langfuse"]
-            self.aclient.failure_callback = ["langfuse"]
-            self.client.success_callback = ["langfuse"]
-            self.client.failure_callback = ["langfuse"]
+        try:
+            base_config = get_base_config()
+            if base_config and base_config.monitoring_tool == MonitoringTool.LANGFUSE:
+                self._configure_langfuse_callbacks(self.aclient)
+                self._configure_langfuse_callbacks(self.client)
+        except Exception as e:
+            # Log the error but don't fail initialization
+            print(f"Warning: Failed to configure monitoring: {str(e)}")

💡 Codebase verification

Missing Langfuse environment documentation and host configuration

The Langfuse integration requires proper documentation and configuration:

  • The required environment variables (LANGFUSE_PUBLIC_KEY, LANGFUSE_SECRET_KEY, LANGFUSE_HOST) are not documented in any .md files
  • LANGFUSE_HOST is accessed directly with os.environ[] instead of os.getenv(), which could cause crashes if not set
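A sketch of the safer access pattern the verification flags: `os.getenv` returns None (or a supplied default) when the variable is unset, whereas `os.environ[...]` raises KeyError. The default host URL below is an assumption for illustration.

```python
import os

# Safe: falls back to a default instead of crashing when unset.
host = os.getenv("LANGFUSE_HOST", "https://cloud.langfuse.com")
assert isinstance(host, str)

# Unsafe: raises KeyError if the variable is not set.
try:
    _ = os.environ["SURELY_UNSET_LANGFUSE_VAR"]
except KeyError:
    pass  # this is the crash path os.getenv avoids
```

This matches how GRAPHISTRY_USERNAME and the other settings in base_config.py already use `os.getenv`.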
🔗 Analysis chain

Verify Langfuse configuration requirements

Since Langfuse monitoring is being added, we should ensure all necessary configuration is present.


Let me check the actual Langfuse configuration content to verify if all necessary settings are present.

🏁 Scripts executed

The following scripts were executed for the analysis:

Script:

#!/bin/bash
# Check for Langfuse configuration in base config and environment variables

# Check if BaseConfig contains Langfuse configuration
echo "Checking BaseConfig for Langfuse configuration..."
rg -l "langfuse" --type py

# Check for environment variable documentation
echo "Checking for environment variable documentation..."
find . -type f -name "*.md" -exec grep -l "LANGFUSE" {} \;

Length of output: 456


Script:

#!/bin/bash
# Check Langfuse configuration details

# Check BaseConfig for Langfuse settings
echo "=== BaseConfig Langfuse settings ==="
rg -A 5 -B 5 "langfuse" cognee/base_config.py

# Check environment variables and configuration in data models
echo -e "\n=== Data models Langfuse configuration ==="
rg -A 5 -B 5 "langfuse" cognee/shared/data_models.py

# Check if Langfuse secret/public key settings exist
echo -e "\n=== Environment variables for Langfuse ==="
rg -i "LANGFUSE.*KEY|LANGFUSE.*SECRET" .

Length of output: 1434


async def acreate_structured_output(self, text_input: str, system_prompt: str, response_model: Type[BaseModel]) -> BaseModel:
"""Generate a response from a user query."""