diff --git a/mindsdb_sql/parser/dialects/mindsdb/__init__.py b/mindsdb_sql/parser/dialects/mindsdb/__init__.py
index ef92b64f..565ea39d 100644
--- a/mindsdb_sql/parser/dialects/mindsdb/__init__.py
+++ b/mindsdb_sql/parser/dialects/mindsdb/__init__.py
@@ -16,6 +16,8 @@ from .drop_job import DropJob
 from .chatbot import CreateChatBot, UpdateChatBot, DropChatBot
 from .trigger import CreateTrigger, DropTrigger
+from .knowledge_base import CreateKnowledgeBase, DropKnowledgeBase
 
 # remove it in next release
 CreateDatasource = CreateDatabase
+
diff --git a/mindsdb_sql/parser/dialects/mindsdb/knowledge_base.py b/mindsdb_sql/parser/dialects/mindsdb/knowledge_base.py
new file mode 100644
index 00000000..f7a462c1
--- /dev/null
+++ b/mindsdb_sql/parser/dialects/mindsdb/knowledge_base.py
@@ -0,0 +1,98 @@
+from mindsdb_sql.parser.ast.base import ASTNode
+from mindsdb_sql.parser.utils import indent
+
+
+class CreateKnowledgeBase(ASTNode):
+    """
+    Create a new knowledge base
+    """
+    def __init__(
+        self,
+        name,
+        model,
+        storage,
+        from_select=None,
+        params=None,
+        if_not_exists=False,
+        *args,
+        **kwargs,
+    ):
+        """
+        Args:
+            name: Identifier -- name of the knowledge base
+            model: Identifier -- name of the model to use
+            storage: Identifier -- name of the storage to use
+            from_select: SelectStatement -- select statement to use as the source of the knowledge base
+            params: dict -- additional parameters to pass to the knowledge base, e.g. chunking strategy
+            if_not_exists: bool -- if True, do not raise an error if the knowledge base already exists
+        """
+        super().__init__(*args, **kwargs)
+        self.name = name
+        self.model = model
+        self.storage = storage
+        self.params = params
+        self.if_not_exists = if_not_exists
+        self.from_query = from_select
+
+    def to_tree(self, *args, level=0, **kwargs):
+        ind = indent(level)
+        out_str = f"""
+        {ind}CreateKnowledgeBase(
+        {ind}    if_not_exists={self.if_not_exists},
+        {ind}    name={self.name.to_string()},
+        {ind}    from_query={self.from_query.to_tree(level=level+1) if self.from_query else None},
+        {ind}    model={self.model.to_string()},
+        {ind}    storage={self.storage.to_string()},
+        {ind}    params={self.params}
+        {ind})
+        """
+        return out_str
+
+    def get_string(self, *args, **kwargs):
+        params = self.params.copy() if self.params else {}
+        using_ar = [f"{k}={repr(v)}" for k, v in params.items()]
+        using_str = ", ".join(using_ar)
+        from_query_str = (
+            f"FROM ({self.from_query.get_string()})" if self.from_query else ""
+        )
+
+        out_str = (
+            f"CREATE KNOWLEDGE_BASE {'IF NOT EXISTS ' if self.if_not_exists else ''}{self.name.to_string()} "
+            f"{from_query_str} "
+            f"USING MODEL = {self.model.to_string()}, "
+            f"STORAGE = {self.storage.to_string()}"
+            f"{', ' + using_str if using_str else ''}"
+        )
+
+        return out_str
+
+    def __repr__(self) -> str:
+        return self.to_tree()
+
+
+class DropKnowledgeBase(ASTNode):
+    """
+    Delete a knowledge base
+    """
+    def __init__(self, name, if_exists=False, *args, **kwargs):
+        """
+        Args:
+            name: Identifier -- name of the knowledge base
+            if_exists: bool -- if True, do not raise an error if the knowledge base does not exist
+        """
+        super().__init__(*args, **kwargs)
+        self.name = name
+        self.if_exists = if_exists
+
+    def to_tree(self, *args, level=0, **kwargs):
+        ind = indent(level)
+        out_str = (
+            f"{ind}DropKnowledgeBase("
+            f"{ind}    if_exists={self.if_exists}, "
+            f"name={self.name.to_string()})"
+        )
+        return out_str
+
+    def get_string(self, *args, **kwargs):
+        out_str = f'DROP KNOWLEDGE_BASE {"IF EXISTS " if self.if_exists else ""}{self.name.to_string()}'
+        return out_str
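# --- Reviewer sketch (not part of the patch): how the new AST nodes above are
# --- meant to be constructed and rendered. Import paths follow this diff; the
# --- extra `chunk_size` parameter is purely hypothetical.
from mindsdb_sql.parser.ast import Identifier
from mindsdb_sql.parser.dialects.mindsdb import CreateKnowledgeBase, DropKnowledgeBase

kb = CreateKnowledgeBase(
    name=Identifier('my_kb'),
    model=Identifier(parts=['mindsdb', 'my_embedding_model']),
    storage=Identifier(parts=['my_vector_database', 'some_table']),
    params={'chunk_size': 500},  # hypothetical extra USING parameter
)
print(kb.get_string())   # renders a CREATE KNOWLEDGE_BASE ... USING ... statement
print(kb.to_tree())      # tree form used by __repr__
print(DropKnowledgeBase(name=Identifier('my_kb'), if_exists=True).get_string())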
diff --git a/mindsdb_sql/parser/dialects/mindsdb/lexer.py b/mindsdb_sql/parser/dialects/mindsdb/lexer.py
index 1cc8ce1f..5b046695 100644
--- a/mindsdb_sql/parser/dialects/mindsdb/lexer.py
+++ b/mindsdb_sql/parser/dialects/mindsdb/lexer.py
@@ -30,6 +30,7 @@ class MindsDBLexer(Lexer):
         LATEST, HORIZON, USING,
         ENGINE, TRAIN, PREDICT, PARAMETERS, JOB, CHATBOT, EVERY,
         PROJECT, ANOMALY, DETECTION,
+        KNOWLEDGE_BASE, KNOWLEDGE_BASES,
 
         # SHOW/DDL Keywords
@@ -115,6 +116,9 @@ class MindsDBLexer(Lexer):
     ANOMALY = r'\bANOMALY\b'
     DETECTION = r'\bDETECTION\b'
 
+    KNOWLEDGE_BASE = r'\bKNOWLEDGE[_\s]BASE\b'
+    KNOWLEDGE_BASES = r'\bKNOWLEDGE[_\s]BASES\b'
+
     # Misc
     SET = r'\bSET\b'
     START = r'\bSTART\b'
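# --- Reviewer sketch (not part of the patch): a standalone check, using plain
# --- `re`, that the new token patterns accept both the underscore and the
# --- space spelling (assuming the lexer's case-insensitive matching).
import re

KNOWLEDGE_BASE = re.compile(r'\bKNOWLEDGE[_\s]BASE\b', re.IGNORECASE)
KNOWLEDGE_BASES = re.compile(r'\bKNOWLEDGE[_\s]BASES\b', re.IGNORECASE)

assert KNOWLEDGE_BASE.search('CREATE KNOWLEDGE_BASE my_kb')
assert KNOWLEDGE_BASE.search('create knowledge base my_kb')
assert KNOWLEDGE_BASE.search('SHOW KNOWLEDGE BASES') is None   # plural is a separate token
assert KNOWLEDGE_BASES.search('SHOW KNOWLEDGE BASES')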
diff --git a/mindsdb_sql/parser/dialects/mindsdb/parser.py b/mindsdb_sql/parser/dialects/mindsdb/parser.py
index 81058dda..c8e91070 100644
--- a/mindsdb_sql/parser/dialects/mindsdb/parser.py
+++ b/mindsdb_sql/parser/dialects/mindsdb/parser.py
@@ -16,6 +16,7 @@ from mindsdb_sql.parser.dialects.mindsdb.latest import Latest
 from mindsdb_sql.parser.dialects.mindsdb.evaluate import Evaluate
 from mindsdb_sql.parser.dialects.mindsdb.create_file import CreateFile
+from mindsdb_sql.parser.dialects.mindsdb.knowledge_base import CreateKnowledgeBase, DropKnowledgeBase
 from mindsdb_sql.exceptions import ParsingException
 from mindsdb_sql.parser.dialects.mindsdb.lexer import MindsDBLexer
 from mindsdb_sql.parser.dialects.mindsdb.retrain_predictor import RetrainPredictor
@@ -80,10 +81,52 @@ class MindsDBParser(Parser):
        'update_chat_bot',
        'create_trigger',
        'drop_trigger',
+       'create_kb',
+       'drop_kb',
     )
     def query(self, p):
         return p[0]
 
+    # -- Knowledge Base --
+    @_(
+        'CREATE KNOWLEDGE_BASE if_not_exists_or_empty identifier USING kw_parameter_list',
+        # from select
+        'CREATE KNOWLEDGE_BASE if_not_exists_or_empty identifier FROM LPAREN select RPAREN USING kw_parameter_list',
+        'CREATE KNOWLEDGE_BASE if_not_exists_or_empty identifier FROM LPAREN select RPAREN',
+    )
+    def create_kb(self, p):
+        params = getattr(p, 'kw_parameter_list', {})
+        from_query = getattr(p, 'select', None)
+        name = p.identifier
+        # check model and storage are in params
+        model = params.pop('model', None) or params.pop('MODEL', None)  # case insensitive
+        storage = params.pop('storage', None) or params.pop('STORAGE', None)  # case insensitive
+        if not model:
+            raise ParsingException('Missing model parameter')
+        if isinstance(model, str):
+            # convert to identifier
+            model = Identifier(model)
+        if not storage:
+            raise ParsingException('Missing storage parameter')
+        if isinstance(storage, str):
+            # convert to identifier
+            storage = Identifier(storage)
+        if_not_exists = p.if_not_exists_or_empty
+
+        return CreateKnowledgeBase(
+            name=name,
+            model=model,
+            storage=storage,
+            from_select=from_query,
+            params=params,
+            if_not_exists=if_not_exists
+        )
+
+    @_('DROP KNOWLEDGE_BASE if_exists_or_empty identifier')
+    def drop_kb(self, p):
+        return DropKnowledgeBase(name=p.identifier, if_exists=p.if_exists_or_empty)
+
     # -- ChatBot --
     @_('CREATE CHATBOT identifier USING kw_parameter_list')
     def create_chat_bot(self, p):
@@ -130,14 +173,10 @@ def drop_trigger(self, p):
 
     # -- Jobs --
-    @_('CREATE JOB identifier LPAREN raw_query RPAREN job_schedule',
-       'CREATE JOB identifier AS LPAREN raw_query RPAREN job_schedule',
-       'CREATE JOB identifier LPAREN raw_query RPAREN',
-       'CREATE JOB identifier AS LPAREN raw_query RPAREN',
-       'CREATE JOB IF_NOT_EXISTS identifier LPAREN raw_query RPAREN job_schedule',
-       'CREATE JOB IF_NOT_EXISTS identifier AS LPAREN raw_query RPAREN job_schedule',
-       'CREATE JOB IF_NOT_EXISTS identifier LPAREN raw_query RPAREN',
-       'CREATE JOB IF_NOT_EXISTS identifier AS LPAREN raw_query RPAREN')
+    @_('CREATE JOB if_not_exists_or_empty identifier LPAREN raw_query RPAREN job_schedule',
+       'CREATE JOB if_not_exists_or_empty identifier AS LPAREN raw_query RPAREN job_schedule',
+       'CREATE JOB if_not_exists_or_empty identifier LPAREN raw_query RPAREN',
+       'CREATE JOB if_not_exists_or_empty identifier AS LPAREN raw_query RPAREN')
     def create_job(self, p):
         query_str = tokens_to_string(p.raw_query)
@@ -164,7 +203,7 @@ def create_job(self, p):
             start_str=start_str,
             end_str=end_str,
             repeat_str=repeat_str,
-            if_not_exists=hasattr(p, 'IF_NOT_EXISTS')
+            if_not_exists=p.if_not_exists_or_empty
         )
 
     @_('START string',
@@ -196,10 +235,9 @@ def job_schedule(self, p):
             schedule = {param: value}
         return schedule
 
-    @_('DROP JOB identifier',
-       'DROP JOB IF_EXISTS identifier')
+    @_('DROP JOB if_exists_or_empty identifier')
     def drop_job(self, p):
-        return DropJob(name=p.identifier, if_exists=hasattr(p, 'IF_EXISTS'))
+        return DropJob(name=p.identifier, if_exists=p.if_exists_or_empty)
 
     # Explain
@@ -214,27 +252,20 @@ def alter_table(self, p):
                           arg=' '.join([p.id0, p.id1]))
 
     # DROP VEW
-    @_('DROP VIEW identifier',
-       'DROP VIEW IF_EXISTS identifier')
+    @_('DROP VIEW if_exists_or_empty identifier')
     def drop_view(self, p):
-        if_exists = hasattr(p, 'IF_EXISTS')
-        return DropView([p.identifier], if_exists=if_exists)
+        return DropView([p.identifier], if_exists=p.if_exists_or_empty)
 
-    @_('DROP VIEW enumeration',
-       'DROP VIEW IF_EXISTS enumeration')
+    @_('DROP VIEW if_exists_or_empty enumeration')
     def drop_view(self, p):
-        if_exists = hasattr(p, 'IF_EXISTS')
-        return DropView(p.enumeration, if_exists=if_exists)
+        return DropView(p.enumeration, if_exists=p.if_exists_or_empty)
 
     # DROP DATABASE
-    @_('DROP DATABASE identifier',
-       'DROP DATABASE IF_EXISTS identifier',
-       'DROP PROJECT identifier',
-       'DROP SCHEMA identifier',
-       'DROP SCHEMA IF_EXISTS identifier')
+    @_('DROP DATABASE if_exists_or_empty identifier',
+       'DROP PROJECT if_exists_or_empty identifier',
+       'DROP SCHEMA if_exists_or_empty identifier')
     def drop_database(self, p):
-        if_exists = hasattr(p, 'IF_EXISTS')
-        return DropDatabase(name=p.identifier, if_exists=if_exists)
+        return DropDatabase(name=p.identifier, if_exists=p.if_exists_or_empty)
 
     # Transactions
@@ -461,6 +492,7 @@ def show(self, p):
        'ML_ENGINES',
        'HANDLERS',
        'SEARCH_PATH',
+       'KNOWLEDGE_BASES',
        'ALL')
     def show_category(self, p):
         return ' '.join([x for x in p])
@@ -584,17 +616,15 @@ def use(self, p):
         return Use(value=p.identifier)
 
     # CREATE VIEW
-    @_('CREATE VIEW identifier create_view_from_table_or_nothing AS LPAREN raw_query RPAREN',
-       'CREATE VIEW identifier create_view_from_table_or_nothing LPAREN raw_query RPAREN',
-       'CREATE VIEW IF_NOT_EXISTS identifier create_view_from_table_or_nothing AS LPAREN raw_query RPAREN',
-       'CREATE VIEW IF_NOT_EXISTS identifier create_view_from_table_or_nothing LPAREN raw_query RPAREN')
+    @_('CREATE VIEW if_not_exists_or_empty identifier create_view_from_table_or_nothing AS LPAREN raw_query RPAREN',
+       'CREATE VIEW if_not_exists_or_empty identifier create_view_from_table_or_nothing LPAREN raw_query RPAREN')
     def create_view(self, p):
         query_str = tokens_to_string(p.raw_query)
 
         return CreateView(name=p.identifier,
                           from_table=p.create_view_from_table_or_nothing,
                           query_str=query_str,
-                          if_not_exists=hasattr(p, 'IF_NOT_EXISTS'))
+                          if_not_exists=p.if_not_exists_or_empty)
 
     @_('FROM identifier')
     def create_view_from_table_or_nothing(self, p):
@@ -605,38 +635,30 @@ def create_view_from_table_or_nothing(self, p):
         pass
 
     # DROP PREDICTOR
-    @_('DROP PREDICTOR identifier',
-       'DROP MODEL identifier',
-       'DROP PREDICTOR IF_EXISTS identifier',
-       'DROP MODEL IF_EXISTS identifier')
+    @_('DROP PREDICTOR if_exists_or_empty identifier',
+       'DROP MODEL if_exists_or_empty identifier')
     def drop_predictor(self, p):
-        if_exists = hasattr(p, 'IF_EXISTS')
-        return DropPredictor(p.identifier, if_exists=if_exists)
+        return DropPredictor(p.identifier, if_exists=p.if_exists_or_empty)
 
     # DROP DATASOURCE
-    @_('DROP DATASOURCE identifier',
-       'DROP DATASOURCE IF_EXISTS identifier')
+    @_('DROP DATASOURCE if_exists_or_empty identifier')
    def drop_datasource(self, p):
-        return DropDatasource(p.identifier, if_exists=hasattr(p, 'IF_EXISTS'))
+        return DropDatasource(p.identifier, if_exists=p.if_exists_or_empty)
 
     # DROP DATASET
-    @_('DROP DATASET identifier',
-       'DROP DATASET IF_EXISTS identifier')
+    @_('DROP DATASET if_exists_or_empty identifier')
     def drop_dataset(self, p):
-        return DropDataset(p.identifier, if_exists=hasattr(p, 'IF_EXISTS'))
+        return DropDataset(p.identifier, if_exists=p.if_exists_or_empty)
 
     # DROP TABLE
-    @_('DROP TABLE IF_EXISTS identifier',
-       'DROP TABLE identifier')
+    @_('DROP TABLE if_exists_or_empty identifier')
     def drop_table(self, p):
-        if_exists = hasattr(p, 'IF_EXISTS')
-        return DropTables(tables=[p.identifier], if_exists=if_exists)
+        return DropTables(tables=[p.identifier], if_exists=p.if_exists_or_empty)
 
     # create table
-    @_('CREATE TABLE identifier select',
-       'CREATE TABLE identifier LPAREN select RPAREN',
-       'CREATE TABLE IF_NOT_EXISTS identifier select',
-       'CREATE TABLE IF_NOT_EXISTS identifier LPAREN select RPAREN',
+    @_('CREATE TABLE identifier select',  # TODO tests failing without it
+       'CREATE TABLE if_not_exists_or_empty identifier select',
+       'CREATE TABLE if_not_exists_or_empty identifier LPAREN select RPAREN',
        'CREATE OR REPLACE TABLE identifier select',
        'CREATE OR REPLACE TABLE identifier LPAREN select RPAREN')
     def create_table(self, p):
@@ -648,7 +670,7 @@ def create_table(self, p):
             name=p.identifier,
             is_replace=is_replace,
             from_select=p.select,
-            if_not_exists=hasattr(p, 'IF_NOT_EXISTS')
+            if_not_exists=getattr(p, 'if_not_exists_or_empty', False)
         )
 
     @_('CREATE TABLE identifier USING kw_parameter_list')
@@ -687,14 +709,10 @@ def create_predictor(self, p):
         p.create_predictor.order_by = p.ordering_terms
         return p.create_predictor
 
-    @_('CREATE PREDICTOR identifier FROM identifier LPAREN raw_query RPAREN PREDICT result_columns',
-       'CREATE PREDICTOR identifier PREDICT result_columns',
-       'CREATE PREDICTOR IF_NOT_EXISTS identifier FROM identifier LPAREN raw_query RPAREN PREDICT result_columns',
-       'CREATE PREDICTOR IF_NOT_EXISTS identifier PREDICT result_columns',
-       'CREATE MODEL identifier FROM identifier LPAREN raw_query RPAREN PREDICT result_columns',
-       'CREATE MODEL identifier PREDICT result_columns',
-       'CREATE MODEL IF_NOT_EXISTS identifier FROM identifier LPAREN raw_query RPAREN PREDICT result_columns',
-       'CREATE MODEL IF_NOT_EXISTS identifier PREDICT result_columns')
+    @_('CREATE PREDICTOR if_not_exists_or_empty identifier FROM identifier LPAREN raw_query RPAREN PREDICT result_columns',
+       'CREATE PREDICTOR if_not_exists_or_empty identifier PREDICT result_columns',
+       'CREATE MODEL if_not_exists_or_empty identifier FROM identifier LPAREN raw_query RPAREN PREDICT result_columns',
+       'CREATE MODEL if_not_exists_or_empty identifier PREDICT result_columns')
     def create_predictor(self, p):
         query_str = None
         if hasattr(p, 'raw_query'):
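# --- Reviewer sketch (not part of the patch): the refactor in this file routes
# --- every optional IF [NOT] EXISTS through the shared if_exists_or_empty /
# --- if_not_exists_or_empty productions (defined near the end of the diff), so
# --- each statement keeps a single grammar rule. A quick parse check, assuming
# --- the AST nodes store the flag as an attribute as elsewhere in the library:
from mindsdb_sql import parse_sql

assert parse_sql('DROP MODEL IF EXISTS home_rentals', dialect='mindsdb').if_exists is True
assert parse_sql('DROP MODEL home_rentals', dialect='mindsdb').if_exists is False
assert parse_sql('DROP JOB IF EXISTS my_job', dialect='mindsdb').if_exists is True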
@@ -711,7 +729,7 @@ def create_predictor(self, p):
             integration_name=getattr(p, 'identifier1', None),
             query_str=query_str,
             targets=p.result_columns,
-            if_not_exists=hasattr(p, 'IF_NOT_EXISTS')
+            if_not_exists=p.if_not_exists_or_empty
         )
 
     # Typed models
@@ -815,21 +833,18 @@ def evaluate(self, p):
 
     # ML ENGINE
     # CREATE
-    @_('CREATE ML_ENGINE identifier FROM id USING kw_parameter_list',
-       'CREATE ML_ENGINE identifier FROM id',
-       'CREATE ML_ENGINE IF_NOT_EXISTS identifier FROM id USING kw_parameter_list',
-       'CREATE ML_ENGINE IF_NOT_EXISTS identifier FROM id')
+    @_('CREATE ML_ENGINE if_not_exists_or_empty identifier FROM id USING kw_parameter_list',
+       'CREATE ML_ENGINE if_not_exists_or_empty identifier FROM id')
     def create_integration(self, p):
         return CreateMLEngine(name=p.identifier,
                               handler=p.id,
                               params=getattr(p, 'kw_parameter_list', None),
-                              if_not_exists=hasattr(p, 'IF_NOT_EXISTS'))
+                              if_not_exists=p.if_not_exists_or_empty)
 
     # DROP
-    @_('DROP ML_ENGINE identifier',
-       'DROP ML_ENGINE IF_EXISTS identifier')
+    @_('DROP ML_ENGINE if_exists_or_empty identifier')
     def create_integration(self, p):
-        return DropMLEngine(name=p.identifier, if_exists=hasattr(p, 'IF_EXISTS'))
+        return DropMLEngine(name=p.identifier, if_exists=p.if_exists_or_empty)
 
     # CREATE INTEGRATION
     @_('CREATE database_engine',
@@ -856,23 +871,17 @@ def create_integration(self, p):
                                parameters=parameters,
                                if_not_exists=p.database_engine['if_not_exists'])
 
-    @_('DATABASE identifier',
-       'PROJECT identifier',
-       'DATABASE identifier ENGINE string',
-       'DATABASE identifier ENGINE EQUALS string',
-       'DATABASE identifier WITH ENGINE string',
-       'DATABASE identifier WITH ENGINE EQUALS string',
-       'DATABASE IF_NOT_EXISTS identifier',
-       'DATABASE IF_NOT_EXISTS identifier ENGINE string',
-       'DATABASE IF_NOT_EXISTS identifier ENGINE EQUALS string',
-       'DATABASE IF_NOT_EXISTS identifier WITH ENGINE string',
-       'DATABASE IF_NOT_EXISTS identifier WITH ENGINE EQUALS string',
-       'PROJECT IF_NOT_EXISTS identifier')
+    @_('DATABASE if_not_exists_or_empty identifier',
+       'DATABASE if_not_exists_or_empty identifier ENGINE string',
+       'DATABASE if_not_exists_or_empty identifier ENGINE EQUALS string',
+       'DATABASE if_not_exists_or_empty identifier WITH ENGINE string',
+       'DATABASE if_not_exists_or_empty identifier WITH ENGINE EQUALS string',
+       'PROJECT if_not_exists_or_empty identifier')
     def database_engine(self, p):
         engine = None
         if hasattr(p, 'string'):
             engine = p.string
-        return {'identifier': p.identifier, 'engine': engine, 'if_not_exists': hasattr(p, 'IF_NOT_EXISTS')}
+        return {'identifier': p.identifier, 'engine': engine, 'if_not_exists': p.if_not_exists_or_empty}
 
     # UNION / UNION ALL
     @_('select UNION select')
@@ -1426,9 +1435,12 @@ def kw_parameter_list(self, p):
         return params
 
     @_('identifier EQUALS object',
-       'identifier EQUALS json_value')
+       'identifier EQUALS json_value',
+       'identifier EQUALS identifier')
     def kw_parameter(self, p):
-        key = '.'.join(p.identifier.parts)
+        key = getattr(p, 'identifier', None) or getattr(p, 'identifier0', None)
+        assert key is not None
+        key = '.'.join(key.parts)
         return {key: p[2]}
 
     # json
@@ -1622,6 +1634,24 @@ def raw_query(self, p):
     def raw_query(self, p):
         return p[0] + p[1]
 
+    @_(
+        'IF_NOT_EXISTS',
+        'empty'
+    )
+    def if_not_exists_or_empty(self, p):
+        if hasattr(p, 'IF_NOT_EXISTS'):
+            return True
+        return False
+
+    @_(
+        'IF_EXISTS',
+        'empty'
+    )
+    def if_exists_or_empty(self, p):
+        if hasattr(p, 'IF_EXISTS'):
+            return True
+        return False
+
     @_(*all_tokens_list)
     def raw_query(self, p):
         return p._slice
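# --- Reviewer sketch (not part of the patch): what a downstream handler sees
# --- once create_kb above has run. MODEL and STORAGE are popped out of the
# --- USING list into dedicated attributes; everything else stays in params.
from mindsdb_sql import parse_sql

ast = parse_sql(
    '''
    CREATE KNOWLEDGE_BASE my_kb
    USING
        MODEL = mindsdb.my_embedding_model,
        STORAGE = my_vector_database.some_table,
        some_param = 'some value'
    ''',
    dialect='mindsdb',
)
assert ast.name.parts == ['my_kb']
assert ast.model.parts == ['mindsdb', 'my_embedding_model']
assert ast.storage.parts == ['my_vector_database', 'some_table']
assert ast.params == {'some_param': 'some value'}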
diff --git a/tests/test_parser/test_mindsdb/test_knowledgebase.py b/tests/test_parser/test_mindsdb/test_knowledgebase.py
new file mode 100644
index 00000000..50812217
--- /dev/null
+++ b/tests/test_parser/test_mindsdb/test_knowledgebase.py
@@ -0,0 +1,346 @@
+import pytest
+from mindsdb_sql import parse_sql
+from mindsdb_sql.parser.dialects.mindsdb.knowledge_base import (
+    CreateKnowledgeBase,
+    DropKnowledgeBase,
+)
+from mindsdb_sql.parser.ast import (
+    Select,
+    Identifier,
+    Join,
+    Show,
+    BinaryOperation,
+    Constant,
+    Star,
+    Delete,
+    Insert,
+    OrderBy,
+)
+
+
+def test_create_knowledge_base():
+    # create without select
+    sql = """
+    CREATE KNOWLEDGE_BASE my_knowledge_base
+    USING
+        MODEL = mindsdb.my_embedding_model,
+        STORAGE = my_vector_database.some_table
+    """
+    ast = parse_sql(sql, dialect="mindsdb")
+    expected_ast = CreateKnowledgeBase(
+        name=Identifier("my_knowledge_base"),
+        if_not_exists=False,
+        model=Identifier(parts=["mindsdb", "my_embedding_model"]),
+        storage=Identifier(parts=["my_vector_database", "some_table"]),
+        from_select=None,
+        params={},
+    )
+    assert ast == expected_ast
+
+    # the alias KNOWLEDGE BASE without underscore shall also work
+    sql = """
+    CREATE KNOWLEDGE BASE my_knowledge_base
+    USING
+        MODEL = mindsdb.my_embedding_model,
+        STORAGE = my_vector_database.some_table
+    """
+    ast = parse_sql(sql, dialect="mindsdb")
+    assert ast == expected_ast
+
+    # the order of MODEL and STORAGE should not matter
+    sql = """
+    CREATE KNOWLEDGE_BASE my_knowledge_base
+    USING
+        STORAGE = my_vector_database.some_table,
+        MODEL = mindsdb.my_embedding_model
+    """
+    ast = parse_sql(sql, dialect="mindsdb")
+    assert ast == expected_ast
+
+    # create from a query
+    sql = """
+    CREATE KNOWLEDGE_BASE my_knowledge_base
+    FROM (
+        SELECT id, content, embeddings, metadata
+        FROM my_table
+        JOIN my_embedding_model
+    )
+    USING
+        MODEL = mindsdb.my_embedding_model,
+        STORAGE = my_vector_database.some_table
+    """
+    ast = parse_sql(sql, dialect="mindsdb")
+    expected_ast = CreateKnowledgeBase(
+        name=Identifier("my_knowledge_base"),
+        if_not_exists=False,
+        model=Identifier(parts=["mindsdb", "my_embedding_model"]),
+        storage=Identifier(parts=["my_vector_database", "some_table"]),
+        from_select=Select(
+            targets=[
+                Identifier("id"),
+                Identifier("content"),
+                Identifier("embeddings"),
+                Identifier("metadata"),
+            ],
+            from_table=Join(
+                left=Identifier("my_table"),
+                right=Identifier("my_embedding_model"),
+                join_type="JOIN",
+            ),
+        ),
+        params={},
+    )
+
+    assert ast == expected_ast
+
+    # create without MODEL is an error for now;
+    # we may allow this in the future when we have a default model
+    sql = """
+    CREATE KNOWLEDGE_BASE my_knowledge_base
+    USING
+        STORAGE = my_vector_database.some_table
+    """
+    with pytest.raises(Exception):
+        ast = parse_sql(sql, dialect="mindsdb")
+
+    # create without STORAGE is an error for now;
+    # we may allow this in the future when we have a default storage
+    sql = """
+    CREATE KNOWLEDGE_BASE my_knowledge_base
+    USING
+        MODEL = mindsdb.my_embedding_model
+    """
+    with pytest.raises(Exception):
+        ast = parse_sql(sql, dialect="mindsdb")
+
+    # create if not exists
+    sql = """
+    CREATE KNOWLEDGE_BASE IF NOT EXISTS my_knowledge_base
+    USING
+        MODEL = mindsdb.my_embedding_model,
+        STORAGE = my_vector_database.some_table
+    """
+    ast = parse_sql(sql, dialect="mindsdb")
+    expected_ast = CreateKnowledgeBase(
+        name=Identifier("my_knowledge_base"),
+        if_not_exists=True,
+        model=Identifier(parts=["mindsdb", "my_embedding_model"]),
+        storage=Identifier(parts=["my_vector_database", "some_table"]),
+        from_select=None,
+        params={},
+    )
+    assert ast == expected_ast
+
+    # create with params
+    sql = """
+    CREATE KNOWLEDGE_BASE my_knowledge_base
+    USING
+        MODEL = mindsdb.my_embedding_model,
+        STORAGE = my_vector_database.some_table,
+        some_param = 'some value',
+        other_param = 'other value'
+    """
+    ast = parse_sql(sql, dialect="mindsdb")
+    expected_ast = CreateKnowledgeBase(
+        name=Identifier("my_knowledge_base"),
+        if_not_exists=False,
+        model=Identifier(parts=["mindsdb", "my_embedding_model"]),
+        storage=Identifier(parts=["my_vector_database", "some_table"]),
+        from_select=None,
+        params={"some_param": "some value", "other_param": "other value"},
+    )
+    assert ast == expected_ast
+
+
+def test_drop_knowledge_base():
+    # drop if exists
+    sql = """
+    DROP KNOWLEDGE_BASE IF EXISTS my_knowledge_base
+    """
+    ast = parse_sql(sql, dialect="mindsdb")
+    expected_ast = DropKnowledgeBase(
+        name=Identifier("my_knowledge_base"), if_exists=True
+    )
+    assert ast == expected_ast
+
+    # drop without if exists
+    sql = """
+    DROP KNOWLEDGE_BASE my_knowledge_base
+    """
+    ast = parse_sql(sql, dialect="mindsdb")
+
+    expected_ast = DropKnowledgeBase(
+        name=Identifier("my_knowledge_base"), if_exists=False
+    )
+    assert ast == expected_ast
+
+
+@pytest.mark.skip(reason="not implemented")
+def test_alter_knowledge_base():
+    pass
+
+
+def test_show_knowledge_base():
+    sql = """
+    SHOW KNOWLEDGE_BASES
+    """
+    ast = parse_sql(sql, dialect="mindsdb")
+    expected_ast = Show(
+        category="KNOWLEDGE_BASES",
+    )
+    assert ast == expected_ast
+
+    # without underscore shall also work
+    sql = """
+    SHOW KNOWLEDGE BASES
+    """
+    ast = parse_sql(sql, dialect="mindsdb")
+    expected_ast = Show(
+        category="KNOWLEDGE BASES",
+    )
+    assert ast == expected_ast
+
+
+def test_select_from_knowledge_base():
+    # this is no different from a regular select
+    sql = """
+    SELECT * FROM my_knowledge_base
+    WHERE
+        query = 'some text in natural query'
+        AND
+        metadata.some_column = 'some value'
+    ORDER BY
+        distances DESC
+    LIMIT 10
+    """
+    ast = parse_sql(sql, dialect="mindsdb")
+
+    expected_ast = Select(
+        targets=[Star()],
+        from_table=Identifier("my_knowledge_base"),
+        where=BinaryOperation(
+            op="AND",
+            args=[
+                BinaryOperation(
+                    op="=",
+                    args=[Identifier("query"), Constant("some text in natural query")],
+                ),
+                BinaryOperation(
+                    op="=",
+                    args=[Identifier("metadata.some_column"), Constant("some value")],
+                ),
+            ],
+        ),
+        order_by=[OrderBy(field=Identifier("distances"), direction="DESC")],
+        limit=Constant(10),
+    )
+    assert ast == expected_ast
+
+
+def test_delete_from_knowledge_base():
+    # this is no different from a regular delete
+    sql = """
+    DELETE FROM my_knowledge_base
+    WHERE
+        id = 'some id'
+        AND
+        metadata.some_column = 'some value'
+    """
+    ast = parse_sql(sql, dialect="mindsdb")
+    expected_ast = Delete(
+        table=Identifier("my_knowledge_base"),
+        where=BinaryOperation(
+            op="AND",
+            args=[
+                BinaryOperation(op="=", args=[Identifier("id"), Constant("some id")]),
+                BinaryOperation(
+                    op="=",
+                    args=[Identifier("metadata.some_column"), Constant("some value")],
+                ),
+            ],
+        ),
+    )
+    assert ast == expected_ast
+
+
+def test_insert_into_knowledge_base():
+    # this is no different from a regular insert
+    sql = """
+    INSERT INTO my_knowledge_base (
+        id, content, embeddings, metadata
+    )
+    VALUES (
+        'some id',
+        'some text',
+        '[1,2,3,4,5]',
+        '{"some_column": "some value"}'
+    ),
+    (
+        'some other id',
+        'some other text',
+        '[1,2,3,4,5]',
+        '{"some_column": "some value"}'
+    )
+    """
+    ast = parse_sql(sql, dialect="mindsdb")
+    expected_ast = Insert(
+        table=Identifier("my_knowledge_base"),
+        columns=[
+            Identifier("id"),
+            Identifier("content"),
+            Identifier("embeddings"),
+            Identifier("metadata"),
+        ],
+        values=[
+            [
+                Constant("some id"),
+                Constant("some text"),
+                Constant("[1,2,3,4,5]"),
+                Constant('{"some_column": "some value"}'),
+            ],
+            [
+                Constant("some other id"),
+                Constant("some other text"),
+                Constant("[1,2,3,4,5]"),
+                Constant('{"some_column": "some value"}'),
+            ],
+        ],
+    )
+    assert ast == expected_ast
+
+    # insert from a select
+    sql = """
+    INSERT INTO my_knowledge_base (
+        id, content, embeddings, metadata
+    )
+    SELECT id, content, embeddings, metadata
+    FROM my_table
+    WHERE
+        metadata.some_column = 'some value'
+    """
+    ast = parse_sql(sql, dialect="mindsdb")
+    expected_ast = Insert(
+        table=Identifier("my_knowledge_base"),
+        columns=[
+            Identifier("id"),
+            Identifier("content"),
+            Identifier("embeddings"),
+            Identifier("metadata"),
+        ],
+        from_select=Select(
+            targets=[
+                Identifier("id"),
+                Identifier("content"),
+                Identifier("embeddings"),
+                Identifier("metadata"),
+            ],
+            from_table=Identifier("my_table"),
+            where=BinaryOperation(
+                op="=",
+                args=[Identifier("metadata.some_column"), Constant("some value")],
+            ),
+        ),
+    )
+    assert ast == expected_ast
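# --- Reviewer sketch (not part of the patch): the underscore/space spelling
# --- checks above could also be folded into a single parametrized test, e.g.:
import pytest
from mindsdb_sql import parse_sql

@pytest.mark.parametrize('keyword', ['KNOWLEDGE_BASE', 'KNOWLEDGE BASE'])
def test_create_kb_spellings(keyword):
    sql = f'CREATE {keyword} my_kb USING MODEL = m.e, STORAGE = v.t'
    ast = parse_sql(sql, dialect='mindsdb')
    assert ast.name.parts == ['my_kb']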