-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'mindsdb:main' into patch-1
- Loading branch information
Showing
13 changed files
with
612 additions
and
38 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
'''Package metadata for the Minds SDK.'''

__title__ = 'minds_sdk'
__package_name__ = 'minds'
# NOTE: the superseded '1.0.8' assignment (diff residue / dead store) is removed;
# only the current version is kept.
__version__ = '1.2.0'
__description__ = 'An AI-Data Mind is an LLM with the built-in power to answer data questions for Agents'
__email__ = '[email protected]'
__author__ = 'MindsDB Inc'
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from .knowledge_bases import * |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,175 @@ | ||
from typing import Any, Dict, List, Optional, Union | ||
|
||
from pydantic import BaseModel | ||
|
||
from minds.knowledge_bases.preprocessing import PreprocessingConfig | ||
from minds.rest_api import RestAPI | ||
|
||
|
||
class VectorStoreConfig(BaseModel):
    '''Configuration for the underlying vector store for knowledge base embeddings'''
    # Vector store engine identifier (e.g. a MindsDB-supported store) — exact
    # accepted values are defined server-side; verify against the REST API docs.
    engine: str
    # Engine-specific connection parameters, passed through as-is.
    connection_data: Dict[str, Any]
    # Table holding the embeddings inside the vector store.
    # NOTE(review): this field is not forwarded by KnowledgeBases.create() in this
    # file — presumably the server applies its own default; confirm intent.
    table: str = 'embeddings'
|
||
|
||
class EmbeddingConfig(BaseModel):
    '''Configuration for embeddings to use with underlying vector store for knowledge base'''
    # Embedding model provider (e.g. 'openai') — accepted values are server-defined.
    provider: str
    # Embedding model name; sent to the server as 'name' by KnowledgeBases.create().
    model: str
    # Optional provider-specific parameters, merged into the request payload.
    params: Optional[Dict[str, Any]] = None
|
||
|
||
class KnowledgeBaseConfig(BaseModel):
    '''Configuration for a knowledge base'''
    # Unique knowledge base name, used in REST endpoint paths.
    name: str
    # Human-readable description; used by minds to decide what data can be retrieved.
    description: str
    # Optional vector store backing the knowledge base; server default used when None.
    vector_store_config: Optional[VectorStoreConfig] = None
    # Optional embedding model configuration; server default used when None.
    embedding_config: Optional[EmbeddingConfig] = None
    # Params to apply to retrieval pipeline.
    params: Optional[Dict] = None
|
||
|
||
class KnowledgeBaseDocument(BaseModel):
    '''Represents a document that can be inserted into a knowledge base'''
    # Unique document identifier within the knowledge base.
    id: Union[int, str]
    # Raw text content to be embedded and indexed.
    content: str
    # Arbitrary metadata stored alongside the document.
    # NOTE: pydantic deep-copies field defaults per instance, so the shared
    # mutable {} here is safe (unlike a plain-function default argument).
    metadata: Optional[Dict[str, Any]] = {}
|
||
|
||
class KnowledgeBase:
    '''Client-side handle for a single MindsDB knowledge base.

    All insert methods issue a PUT to the knowledge base endpoint;
    ingestion itself happens server-side.
    '''

    def __init__(self, name, api: RestAPI):
        self.name = name
        self.api = api

    def _update(self, update_request: Dict[str, Any], preprocessing_config: Optional[PreprocessingConfig]):
        '''
        Shared helper: attaches the optional preprocessing config and PUTs the update.
        :param update_request: payload describing what to insert
        :param preprocessing_config: optional preprocessing applied server-side
        '''
        if preprocessing_config is not None:
            update_request['preprocessing'] = preprocessing_config.model_dump()
        _ = self.api.put(f'/knowledge_bases/{self.name}', data=update_request)

    def insert_from_select(self, query: str, preprocessing_config: PreprocessingConfig = None):
        '''
        Inserts select content of a connected datasource into this knowledge base
        :param query: The SQL SELECT query to use to retrieve content to be inserted
        :param preprocessing_config: optional preprocessing to apply before insertion
        '''
        self._update({'query': query}, preprocessing_config)

    def insert_documents(self, documents: List[KnowledgeBaseDocument], preprocessing_config: PreprocessingConfig = None):
        '''
        Inserts documents directly into this knowledge base
        :param documents: The documents to insert
        :param preprocessing_config: optional preprocessing to apply before insertion
        '''
        self._update({'rows': [d.model_dump() for d in documents]}, preprocessing_config)

    def insert_urls(self, urls: List[str], preprocessing_config: PreprocessingConfig = None):
        '''
        Crawls URLs & inserts the retrieved webpages into this knowledge base
        :param urls: Valid URLs to crawl & insert
        :param preprocessing_config: optional preprocessing to apply before insertion
        '''
        self._update({'urls': urls}, preprocessing_config)

    def insert_files(self, files: List[str], preprocessing_config: PreprocessingConfig = None):
        '''
        Inserts files that have already been uploaded to MindsDB into this knowledge base
        :param files: Names of preuploaded files to insert
        :param preprocessing_config: optional preprocessing to apply before insertion
        '''
        self._update({'files': files}, preprocessing_config)
|
||
|
||
class KnowledgeBases:
    '''Collection-level operations (create / list / get / drop) for knowledge bases.'''

    def __init__(self, client):
        self.api = client.api

    def create(self, config: KnowledgeBaseConfig) -> KnowledgeBase:
        '''
        Create new knowledge base and return it
        :param config: knowledge base configuration, properties:
          - name: str, name of knowledge base
          - description: str, description of the knowledge base. Used by minds to know what data can be retrieved.
          - vector_store_config: VectorStoreConfig, configuration for embeddings vector store.
          - embedding_config: EmbeddingConfig, configuration for embeddings.
        :return: knowledge base object
        '''
        create_request = {
            'name': config.name,
            'description': config.description
        }
        if config.vector_store_config is not None:
            # NOTE(review): vector_store_config.table is intentionally (?) not sent
            # here — presumably the server uses its own default table; confirm.
            create_request['vector_store'] = {
                'engine': config.vector_store_config.engine,
                'connection_data': config.vector_store_config.connection_data
            }
        if config.embedding_config is not None:
            # The REST API expects the model under the key 'name'.
            embedding_data = {
                'provider': config.embedding_config.provider,
                'name': config.embedding_config.model
            }
            if config.embedding_config.params is not None:
                embedding_data.update(config.embedding_config.params)
            create_request['embedding_model'] = embedding_data
        if config.params is not None:
            create_request['params'] = config.params

        _ = self.api.post('/knowledge_bases', data=create_request)
        # Re-fetch so the returned handle reflects the server-side state.
        return self.get(config.name)

    def list(self) -> List[KnowledgeBase]:
        '''
        Returns list of knowledge bases
        :return: iterable knowledge bases
        '''
        list_knowledge_bases_response = self.api.get('/knowledge_bases')
        return [
            KnowledgeBase(knowledge_base['name'], self.api)
            for knowledge_base in list_knowledge_bases_response.json()
        ]

    def get(self, name: str) -> KnowledgeBase:
        '''
        Get knowledge base by name
        :param name: name of knowledge base
        :return: knowledge base object
        '''
        knowledge_base_response = self.api.get(f'/knowledge_bases/{name}')
        knowledge_base = knowledge_base_response.json()
        return KnowledgeBase(knowledge_base['name'], self.api)

    def drop(self, name: str, force=False):
        '''
        Drop knowledge base by name
        :param name: name of knowledge base
        :param force: if True - remove from all minds, default: False
        '''
        # 'cascade' tells the server to detach the KB from every mind first.
        data = {'cascade': True} if force else None
        self.api.delete(f'/knowledge_bases/{name}', data=data)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
from typing import Any, Dict, List, Literal, Optional | ||
|
||
from pydantic import BaseModel, Field, model_validator | ||
|
||
|
||
DEFAULT_LLM_MODEL = 'gpt-4o' | ||
DEFAULT_LLM_MODEL_PROVIDER = 'openai' | ||
|
||
|
||
class TextChunkingConfig(BaseModel):
    '''Configuration for chunking text content before they are inserted into a knowledge base'''
    separators: List[str] = Field(
        # default_factory is pydantic's documented idiom for mutable defaults;
        # behaviorally identical to default=[...] (pydantic copies defaults),
        # but avoids ever sharing one list object.
        default_factory=lambda: ['\n\n', '\n', ' ', ''],
        description='List of separators to use for splitting text, in order of priority'
    )
    chunk_size: int = Field(
        default=1000,
        description='The target size of each text chunk',
        gt=0
    )
    chunk_overlap: int = Field(
        default=200,
        description='The number of characters to overlap between chunks',
        ge=0
    )
|
||
|
||
class LLMConfig(BaseModel):
    '''LLM settings used when generating context for contextual preprocessing.'''
    model_name: str = Field(default=DEFAULT_LLM_MODEL, description='LLM model to use for context generation')
    provider: str = Field(default=DEFAULT_LLM_MODEL_PROVIDER, description='LLM model provider to use for context generation')
    # default_factory=dict is the pydantic-documented idiom for mutable defaults
    # (equivalent behavior to default={}, which pydantic deep-copies).
    params: Dict[str, Any] = Field(default_factory=dict, description='Additional parameters to pass in when initializing the LLM')
|
||
|
||
class ContextualConfig(BaseModel):
    '''Configuration specific to contextual preprocessing'''
    llm_config: LLMConfig = Field(
        # default_factory defers construction until a model instance needs it,
        # instead of building one shared LLMConfig at import time; per-instance
        # behavior is unchanged (pydantic copies defaults either way).
        default_factory=LLMConfig,
        description='LLM configuration to use for context generation'
    )
    context_template: Optional[str] = Field(
        default=None,
        description='Custom template for context generation'
    )
    chunk_size: int = Field(
        default=1000,
        description='The target size of each text chunk',
        gt=0
    )
    chunk_overlap: int = Field(
        default=200,
        description='The number of characters to overlap between chunks',
        ge=0
    )
|
||
|
||
class PreprocessingConfig(BaseModel):
    '''Complete preprocessing configuration'''
    type: Literal['contextual', 'text_chunking'] = Field(
        default='text_chunking',
        description='Type of preprocessing to apply'
    )
    contextual_config: Optional[ContextualConfig] = Field(
        default=None,
        description='Configuration for contextual preprocessing'
    )
    text_chunking_config: Optional[TextChunkingConfig] = Field(
        default=None,
        description='Configuration for text chunking preprocessing'
    )

    @model_validator(mode='after')
    def validate_config_presence(self) -> 'PreprocessingConfig':
        '''Ensure the appropriate config is present for the chosen type'''
        # Each preprocessing type maps to the attribute holding its config and
        # a factory producing a default; fill the one matching self.type.
        type_defaults = (
            ('contextual', 'contextual_config', ContextualConfig),
            ('text_chunking', 'text_chunking_config', TextChunkingConfig),
        )
        for kind, attr, factory in type_defaults:
            if self.type == kind and not getattr(self, attr):
                setattr(self, attr, factory())
        return self
Oops, something went wrong.