Skip to content

Commit

Permalink
Merge branch 'mindsdb:main' into patch-1
Browse files Browse the repository at this point in the history
  • Loading branch information
Better-Boy authored Nov 22, 2024
2 parents 722911d + 8f44e3b commit b9bf9c5
Show file tree
Hide file tree
Showing 13 changed files with 612 additions and 38 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/test_on_deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.8', '3.9','3.10', '3.11']
python-version: ['3.10']
steps:
- name: Checkout code
uses: actions/checkout@v2
Expand All @@ -28,4 +28,4 @@ jobs:
env:
PYTHONPATH: ./
API_KEY: ${{ secrets.API_KEY }}
BASE_URL: ${{ secrets.BASE_URL }}
BASE_URL: 'https://mdb.ai'
13 changes: 11 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -149,5 +149,14 @@ client.datasources.drop('my_datasource')
```
>Note: The SDK currently does not support automatically removing a data source if it is no longer connected to any mind.
### Other SDKs
#### [Command-Line](https://github.com/Better-Boy/minds-cli-sdk)
### Community Supported SDKs

- [Java-SDK](https://github.com/Better-Boy/minds-java-sdk)
- [Ruby-SDK](https://github.com/tungnt1203/minds_ruby_sdk)
- [Dart-SDK](https://github.com/ArnavK-09/mdb_dart)
- [C# SDK](https://github.com/priyanshuverma-dev/Minds.SDK)
- [Go SDK](https://github.com/Abiji-2020/minds-go-sdk)

#### Command Line Tools
- [Minds CLI](https://github.com/Better-Boy/minds-cli-sdk)

2 changes: 1 addition & 1 deletion minds/__about__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
__title__ = 'minds_sdk'
__package_name__ = 'minds'
__version__ = '1.0.8'
__version__ = '1.2.0'
__description__ = 'An AI-Data Mind is an LLM with the built-in power to answer data questions for Agents'
__email__ = '[email protected]'
__author__ = 'MindsDB Inc'
Expand Down
2 changes: 2 additions & 0 deletions minds/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from minds.rest_api import RestAPI

from minds.datasources import Datasources
from minds.knowledge_bases import KnowledgeBases
from minds.minds import Minds


Expand All @@ -12,5 +13,6 @@ def __init__(self, api_key, base_url=None):
self.api = RestAPI(api_key, base_url)

self.datasources = Datasources(self)
self.knowledge_bases = KnowledgeBases(self)

self.minds = Minds(self)
6 changes: 4 additions & 2 deletions minds/datasources/datasources.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from typing import List, Optional, Union

from pydantic import BaseModel, Field

import minds.utils as utils
import minds.exceptions as exc

class DatabaseConfig(BaseModel):
Expand Down Expand Up @@ -37,8 +37,10 @@ def create(self, ds_config: DatabaseConfig, update=False):

name = ds_config.name

utils.validate_datasource_name(name)

if update:
self.api.put('/datasources', data=ds_config.model_dump())
self.api.put(f'/datasources/{name}', data=ds_config.model_dump())
else:
self.api.post('/datasources', data=ds_config.model_dump())
return self.get(name)
Expand Down
4 changes: 4 additions & 0 deletions minds/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,8 @@ class UnknownError(Exception):


class MindNameInvalid(Exception):
    # Raised when a mind name is rejected as invalid — presumably by a
    # name-validation helper; confirm against minds/utils.py.
    ...


class DatasourceNameInvalid(Exception):
    # Raised when a datasource name fails validation
    # (see utils.validate_datasource_name in datasources.py).
    ...
1 change: 1 addition & 0 deletions minds/knowledge_bases/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .knowledge_bases import *
175 changes: 175 additions & 0 deletions minds/knowledge_bases/knowledge_bases.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
from typing import Any, Dict, List, Optional, Union

from pydantic import BaseModel

from minds.knowledge_bases.preprocessing import PreprocessingConfig
from minds.rest_api import RestAPI


class VectorStoreConfig(BaseModel):
    '''Configuration for the underlying vector store for knowledge base embeddings'''
    # Name of the vector store engine; sent to the API as 'engine'.
    engine: str
    # Engine-specific connection parameters, passed through to the API verbatim.
    connection_data: Dict[str, Any]
    # Table that holds the embeddings within the vector store.
    table: str = 'embeddings'


class EmbeddingConfig(BaseModel):
    '''Configuration for embeddings to use with underlying vector store for knowledge base'''
    # Embedding provider; sent to the API as 'provider'.
    provider: str
    # Embedding model identifier; sent to the API under the key 'name'
    # (see the request built in KnowledgeBases.create).
    model: str
    # Optional provider-specific parameters, merged into the request payload.
    params: Optional[Dict[str, Any]] = None


class KnowledgeBaseConfig(BaseModel):
    '''Configuration for a knowledge base'''
    # Unique name of the knowledge base.
    name: str
    # Used by minds to know what data can be retrieved from this knowledge base.
    description: str
    # Optional; when omitted the server presumably applies defaults — TODO confirm.
    vector_store_config: Optional[VectorStoreConfig] = None
    embedding_config: Optional[EmbeddingConfig] = None
    # Params to apply to retrieval pipeline.
    params: Optional[Dict] = None


class KnowledgeBaseDocument(BaseModel):
    '''Represents a document that can be inserted into a knowledge base'''
    # Identifier of the document within the knowledge base.
    id: Union[int, str]
    # Text content of the document.
    content: str
    # NOTE(review): the mutable `{}` default is safe here because pydantic copies
    # field defaults per instance; `Field(default_factory=dict)` would be more explicit.
    metadata: Optional[Dict[str, Any]] = {}


class KnowledgeBase:
    '''Handle for a single knowledge base, addressed by name.

    Every insert operation issues a PUT to /knowledge_bases/{name} with a
    payload describing where the content comes from, plus an optional
    'preprocessing' section.
    '''

    def __init__(self, name, api: RestAPI):
        self.name = name
        self.api = api

    def _insert(self, update_request: Dict[str, Any], preprocessing_config: PreprocessingConfig = None):
        '''Attach the optional preprocessing config and PUT the update request.

        :param update_request: payload describing the content to insert
        :param preprocessing_config: optional preprocessing applied before insertion
        '''
        if preprocessing_config is not None:
            update_request['preprocessing'] = preprocessing_config.model_dump()
        _ = self.api.put(f'/knowledge_bases/{self.name}', data=update_request)

    def insert_from_select(self, query: str, preprocessing_config: PreprocessingConfig = None):
        '''
        Inserts select content of a connected datasource into this knowledge base
        :param query: The SQL SELECT query to use to retrieve content to be inserted
        :param preprocessing_config: optional preprocessing applied before insertion
        '''
        self._insert({'query': query}, preprocessing_config)

    def insert_documents(self, documents: List[KnowledgeBaseDocument], preprocessing_config: PreprocessingConfig = None):
        '''
        Inserts documents directly into this knowledge base
        :param documents: The documents to insert
        :param preprocessing_config: optional preprocessing applied before insertion
        '''
        self._insert({'rows': [d.model_dump() for d in documents]}, preprocessing_config)

    def insert_urls(self, urls: List[str], preprocessing_config: PreprocessingConfig = None):
        '''
        Crawls URLs & inserts the retrieved webpages into this knowledge base
        :param urls: Valid URLs to crawl & insert
        :param preprocessing_config: optional preprocessing applied before insertion
        '''
        self._insert({'urls': urls}, preprocessing_config)

    def insert_files(self, files: List[str], preprocessing_config: PreprocessingConfig = None):
        '''
        Inserts files that have already been uploaded to MindsDB into this knowledge base
        :param files: Names of preuploaded files to insert
        :param preprocessing_config: optional preprocessing applied before insertion
        '''
        self._insert({'files': files}, preprocessing_config)


class KnowledgeBases:
    '''Collection-level operations (create / list / get / drop) for knowledge bases.'''

    def __init__(self, client):
        self.api = client.api

    def create(self, config: KnowledgeBaseConfig) -> KnowledgeBase:
        '''
        Create new knowledge base and return it
        :param config: knowledge base configuration, properties:
          - name: str, name of knowledge base
          - description: str, description of the knowledge base. Used by minds to know what data can be retrieved.
          - vector_store_config: VectorStoreConfig, configuration for embeddings vector store.
          - embedding_config: EmbeddingConfig, configuration for embeddings.
        :return: knowledge base object
        '''
        payload = {
            'name': config.name,
            'description': config.description,
        }

        vector_store = config.vector_store_config
        if vector_store is not None:
            payload['vector_store'] = {
                'engine': vector_store.engine,
                'connection_data': vector_store.connection_data,
            }

        embedding = config.embedding_config
        if embedding is not None:
            # The API expects the embedding model identifier under the key 'name'.
            embedding_data = {
                'provider': embedding.provider,
                'name': embedding.model,
            }
            if embedding.params is not None:
                embedding_data.update(embedding.params)
            payload['embedding_model'] = embedding_data

        if config.params is not None:
            payload['params'] = config.params

        self.api.post('/knowledge_bases', data=payload)
        # Fetch the freshly created knowledge base so callers get a live handle.
        return self.get(config.name)

    def list(self) -> List[KnowledgeBase]:
        '''
        Returns list of knowledge bases
        :return: iterable knowledge bases
        '''
        response = self.api.get('/knowledge_bases')
        return [
            KnowledgeBase(item['name'], self.api)
            for item in response.json()
        ]

    def get(self, name: str) -> KnowledgeBase:
        '''
        Get knowledge base by name
        :param name: name of knowledge base
        :return: knowledge base object
        '''
        info = self.api.get(f'/knowledge_bases/{name}').json()
        return KnowledgeBase(info['name'], self.api)

    def drop(self, name: str, force=False):
        '''
        Drop knowledge base by name
        :param name: name of knowledge base
        :param force: if True - remove from all minds, default: False
        '''
        self.api.delete(
            f'/knowledge_bases/{name}',
            data={'cascade': True} if force else None,
        )
78 changes: 78 additions & 0 deletions minds/knowledge_bases/preprocessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from typing import Any, Dict, List, Literal, Optional

from pydantic import BaseModel, Field, model_validator


# Defaults used by LLMConfig when a contextual config does not specify its own LLM.
DEFAULT_LLM_MODEL = 'gpt-4o'
DEFAULT_LLM_MODEL_PROVIDER = 'openai'


class TextChunkingConfig(BaseModel):
    '''Configuration for chunking text content before they are inserted into a knowledge base'''
    # NOTE(review): mutable list default is safe — pydantic copies field defaults per instance.
    separators: List[str] = Field(
        default=['\n\n', '\n', ' ', ''],
        description='List of separators to use for splitting text, in order of priority'
    )
    chunk_size: int = Field(
        default=1000,
        description='The target size of each text chunk',
        gt=0
    )
    chunk_overlap: int = Field(
        default=200,
        description='The number of characters to overlap between chunks',
        ge=0
    )


class LLMConfig(BaseModel):
    '''Configuration for the LLM used during contextual preprocessing.'''
    model_name: str = Field(default=DEFAULT_LLM_MODEL, description='LLM model to use for context generation')
    provider: str = Field(default=DEFAULT_LLM_MODEL_PROVIDER, description='LLM model provider to use for context generation')
    # default_factory is the documented pydantic idiom for mutable defaults
    # (pydantic copies `default={}` per instance too, so behavior is unchanged).
    params: Dict[str, Any] = Field(default_factory=dict, description='Additional parameters to pass in when initializing the LLM')


class ContextualConfig(BaseModel):
    '''Configuration specific to contextual preprocessing'''
    # default_factory defers building the LLMConfig until model instantiation,
    # instead of constructing one shared default at class-definition time
    # (pydantic copies instance defaults per model anyway, so behavior is unchanged).
    llm_config: LLMConfig = Field(
        default_factory=LLMConfig,
        description='LLM configuration to use for context generation'
    )
    context_template: Optional[str] = Field(
        default=None,
        description='Custom template for context generation'
    )
    chunk_size: int = Field(
        default=1000,
        description='The target size of each text chunk',
        gt=0
    )
    chunk_overlap: int = Field(
        default=200,
        description='The number of characters to overlap between chunks',
        ge=0
    )


class PreprocessingConfig(BaseModel):
    '''Complete preprocessing configuration'''
    type: Literal['contextual', 'text_chunking'] = Field(
        default='text_chunking',
        description='Type of preprocessing to apply'
    )
    contextual_config: Optional[ContextualConfig] = Field(
        default=None,
        description='Configuration for contextual preprocessing'
    )
    text_chunking_config: Optional[TextChunkingConfig] = Field(
        default=None,
        description='Configuration for text chunking preprocessing'
    )

    @model_validator(mode='after')
    def validate_config_presence(self) -> 'PreprocessingConfig':
        '''Ensure the appropriate config is present for the chosen type'''
        # Rather than rejecting the model, fill in a default config for the
        # selected type so callers may specify only `type`.
        if self.type == 'contextual' and not self.contextual_config:
            self.contextual_config = ContextualConfig()
        if self.type == 'text_chunking' and not self.text_chunking_config:
            self.text_chunking_config = TextChunkingConfig()
        return self
Loading

0 comments on commit b9bf9c5

Please sign in to comment.