From bc09c260b97866d0d21fd781e59839716b9beec8 Mon Sep 17 00:00:00 2001
From: Yuhui Shi
Date: Thu, 31 Aug 2023 23:06:13 -0700
Subject: [PATCH] Add sql syntax for knowledge base

Add knowledge base statements to the mindsdb dialect:

    CREATE KNOWLEDGE_BASE [IF NOT EXISTS] name
        [FROM (select)]
        MODEL model
        STORAGE storage
        [USING key = value, ...]
    DROP KNOWLEDGE_BASE [IF EXISTS] name
    SHOW KNOWLEDGE_BASES

The keyword may be written with an underscore or with a single
whitespace character (KNOWLEDGE_BASE / KNOWLEDGE BASE). MODEL must come
before STORAGE and both clauses are required.

---
 .../parser/dialects/mindsdb/__init__.py       |   2 +
 .../parser/dialects/mindsdb/knowledge_base.py | 100 ++++++
 mindsdb_sql/parser/dialects/mindsdb/lexer.py  |   4 +
 mindsdb_sql/parser/dialects/mindsdb/parser.py |  39 +++
 .../test_mindsdb/test_knowledgebase.py        | 331 ++++++++++++++++++
 5 files changed, 476 insertions(+)
 create mode 100644 mindsdb_sql/parser/dialects/mindsdb/knowledge_base.py
 create mode 100644 tests/test_parser/test_mindsdb/test_knowledgebase.py

diff --git a/mindsdb_sql/parser/dialects/mindsdb/__init__.py b/mindsdb_sql/parser/dialects/mindsdb/__init__.py
index b3ca02c3..d8f1978a 100644
--- a/mindsdb_sql/parser/dialects/mindsdb/__init__.py
+++ b/mindsdb_sql/parser/dialects/mindsdb/__init__.py
@@ -16,6 +16,8 @@
 from .drop_job import DropJob
 from .chatbot import CreateChatBot, UpdateChatBot, DropChatBot
 from .trigger import CreateTrigger, DropTrigger
+from .knowledge_base import CreateKnowledgeBase, DropKnowledgeBase
 
 # remove it in next release
 CreateDatasource = CreateDatabase
+
diff --git a/mindsdb_sql/parser/dialects/mindsdb/knowledge_base.py b/mindsdb_sql/parser/dialects/mindsdb/knowledge_base.py
new file mode 100644
index 00000000..a483e870
--- /dev/null
+++ b/mindsdb_sql/parser/dialects/mindsdb/knowledge_base.py
@@ -0,0 +1,100 @@
+from mindsdb_sql.parser.ast.base import ASTNode
+from mindsdb_sql.parser.utils import indent
+
+
+class CreateKnowledgeBase(ASTNode):
+    """AST node for the ``CREATE KNOWLEDGE_BASE`` statement."""
+
+    def __init__(
+        self,
+        name,
+        model,
+        storage,
+        from_query=None,
+        params=None,
+        if_not_exists=False,
+        *args,
+        **kwargs,
+    ):
+        """
+        :param name: identifier of the knowledge base
+        :param model: identifier given after the MODEL keyword
+        :param storage: identifier given after the STORAGE keyword
+        :param from_query: optional query node from the FROM (...) clause
+        :param params: optional dict of the USING key/value parameters
+        :param if_not_exists: True if IF NOT EXISTS was specified
+        """
+        super().__init__(*args, **kwargs)
+        self.name = name
+        self.model = model
+        self.storage = storage
+        self.params = params
+        self.if_not_exists = if_not_exists
+        self.from_query = from_query
+
+    def to_tree(self, *args, level=0, **kwargs):
+        """Return a multi-line debug representation of the node."""
+        ind = indent(level)
+        out_str = f"""
+        {ind}CreateKnowledgeBase(
+        {ind}    if_not_exists={self.if_not_exists},
+        {ind}    name={self.name.to_string()},
+        {ind}    from_query={self.from_query.to_tree(level=level+1) if self.from_query else None},
+        {ind}    model={self.model.to_string()},
+        {ind}    storage={self.storage.to_string()},
+        {ind}    params={self.params}
+        {ind})
+        """
+        return out_str
+
+    def get_string(self, *args, **kwargs):
+        """Render the node as a ``CREATE KNOWLEDGE_BASE`` SQL statement."""
+        # params defaults to None in __init__, so guard before iterating
+        params = self.params or {}
+        using_str = ", ".join(f"{k}={repr(v)}" for k, v in params.items())
+
+        # Optional clauses are only appended when present: no doubled spaces
+        # and no dangling USING keyword in the rendered statement.
+        parts = ["CREATE KNOWLEDGE_BASE"]
+        if self.if_not_exists:
+            parts.append("IF NOT EXISTS")
+        parts.append(self.name.to_string())
+        if self.from_query:
+            parts.append(f"FROM ({self.from_query.get_string()})")
+        parts.append(f"MODEL {self.model.to_string()}")
+        parts.append(f"STORAGE {self.storage.to_string()}")
+        if using_str:
+            parts.append(f"USING {using_str}")
+
+        return " ".join(parts)
+
+    def __repr__(self) -> str:
+        return self.to_tree()
+
+
+class DropKnowledgeBase(ASTNode):
+    """AST node for the ``DROP KNOWLEDGE_BASE`` statement."""
+
+    def __init__(self, name, if_exists=False, *args, **kwargs):
+        """
+        :param name: identifier of the knowledge base to drop
+        :param if_exists: True if IF EXISTS was specified
+        """
+        super().__init__(*args, **kwargs)
+        self.name = name
+        self.if_exists = if_exists
+
+    def to_tree(self, *args, level=0, **kwargs):
+        """Return a one-line debug representation of the node."""
+        ind = indent(level)
+        return (
+            f"{ind}DropKnowledgeBase("
+            f"if_exists={self.if_exists}, "
+            f"name={self.name.to_string()})"
+        )
+
+    def get_string(self, *args, **kwargs):
+        """Render the node as a ``DROP KNOWLEDGE_BASE`` SQL statement."""
+        # trailing space keeps IF EXISTS separated from the name
+        if_exists_str = "IF EXISTS " if self.if_exists else ""
+        return f"DROP KNOWLEDGE_BASE {if_exists_str}{self.name.to_string()}"
diff --git a/mindsdb_sql/parser/dialects/mindsdb/lexer.py b/mindsdb_sql/parser/dialects/mindsdb/lexer.py
index 2136507e..dfb994a5 100644
--- a/mindsdb_sql/parser/dialects/mindsdb/lexer.py
+++ b/mindsdb_sql/parser/dialects/mindsdb/lexer.py
@@ -29,6 +29,7 @@ class MindsDBLexer(Lexer):
         FINETUNE, EVALUATE,
         LATEST, HORIZON, USING,
         ENGINE, TRAIN, PREDICT, PARAMETERS, JOB, CHATBOT, EVERY,PROJECT,
+        KNOWLEDGE_BASE, KNOWLEDGE_BASES,
 
         # SHOW/DDL Keywords
 
@@ -110,6 +111,9 @@ class MindsDBLexer(Lexer):
     PROJECT = r'\bPROJECT\b'
     EVALUATE = r'\bEVALUATE\b'
 
+    KNOWLEDGE_BASE = r'\bKNOWLEDGE[_\s]BASE\b'
+    KNOWLEDGE_BASES = r'\bKNOWLEDGE[_\s]BASES\b'
+
     # Misc
     SET = r'\bSET\b'
     START = r'\bSTART\b'
diff --git a/mindsdb_sql/parser/dialects/mindsdb/parser.py b/mindsdb_sql/parser/dialects/mindsdb/parser.py
index 14cdd817..03c6ff6f 100644
--- a/mindsdb_sql/parser/dialects/mindsdb/parser.py
+++ b/mindsdb_sql/parser/dialects/mindsdb/parser.py
@@ -16,6 +16,7 @@
 from mindsdb_sql.parser.dialects.mindsdb.latest import Latest
 from mindsdb_sql.parser.dialects.mindsdb.evaluate import Evaluate
 from mindsdb_sql.parser.dialects.mindsdb.create_file import CreateFile
+from mindsdb_sql.parser.dialects.mindsdb.knowledge_base import CreateKnowledgeBase, DropKnowledgeBase
 from mindsdb_sql.exceptions import ParsingException
 from mindsdb_sql.parser.dialects.mindsdb.lexer import MindsDBLexer
 from mindsdb_sql.parser.dialects.mindsdb.retrain_predictor import RetrainPredictor
@@ -79,10 +80,47 @@ class MindsDBParser(Parser):
        'update_chat_bot',
        'create_trigger',
        'drop_trigger',
+       'create_kb',
+       'drop_kb',
        )
     def query(self, p):
         return p[0]
 
+    # -- Knowledge Base --
+    @_(
+        'CREATE KNOWLEDGE_BASE identifier MODEL identifier STORAGE identifier',
+        'CREATE KNOWLEDGE_BASE identifier MODEL identifier STORAGE identifier USING kw_parameter_list',
+        # from select
+        'CREATE KNOWLEDGE_BASE identifier FROM LPAREN select RPAREN MODEL identifier STORAGE identifier',
+        'CREATE KNOWLEDGE_BASE identifier FROM LPAREN select RPAREN MODEL identifier STORAGE identifier USING kw_parameter_list',
+        'CREATE KNOWLEDGE_BASE IF_NOT_EXISTS identifier MODEL identifier STORAGE identifier',
+        'CREATE KNOWLEDGE_BASE IF_NOT_EXISTS identifier MODEL identifier STORAGE identifier USING kw_parameter_list',
+        'CREATE KNOWLEDGE_BASE IF_NOT_EXISTS identifier FROM LPAREN select RPAREN MODEL identifier STORAGE identifier',
+        'CREATE KNOWLEDGE_BASE IF_NOT_EXISTS identifier FROM LPAREN select RPAREN MODEL identifier STORAGE identifier USING kw_parameter_list',
+    )
+    def create_kb(self, p):
+        params = getattr(p, 'kw_parameter_list', {})
+        from_query = getattr(p, 'select', None)
+        name = p.identifier0
+        model = p.identifier1
+        storage = p.identifier2
+        if_not_exists = hasattr(p, 'IF_NOT_EXISTS')
+
+        return CreateKnowledgeBase(
+            name=name,
+            model=model,
+            storage=storage,
+            from_query=from_query,
+            params=params,
+            if_not_exists=if_not_exists
+        )
+
+    @_('DROP KNOWLEDGE_BASE identifier',
+       'DROP KNOWLEDGE_BASE IF_EXISTS identifier')
+    def drop_kb(self, p):
+        if_exists = hasattr(p, 'IF_EXISTS')
+        return DropKnowledgeBase(name=p.identifier, if_exists=if_exists)
+
     # -- ChatBot --
     @_('CREATE CHATBOT identifier USING kw_parameter_list')
     def create_chat_bot(self, p):
@@ -460,6 +498,7 @@ def show(self, p):
        'ML_ENGINES',
        'HANDLERS',
        'SEARCH_PATH',
+       'KNOWLEDGE_BASES',
        'ALL')
     def show_category(self, p):
         return ' '.join([x for x in p])
diff --git a/tests/test_parser/test_mindsdb/test_knowledgebase.py b/tests/test_parser/test_mindsdb/test_knowledgebase.py
new file mode 100644
index 00000000..fe9e7c80
--- /dev/null
+++ b/tests/test_parser/test_mindsdb/test_knowledgebase.py
@@ -0,0 +1,331 @@
+import pytest
+from mindsdb_sql import parse_sql
+from mindsdb_sql.parser.dialects.mindsdb.knowledge_base import (
+    CreateKnowledgeBase,
+    DropKnowledgeBase,
+)
+from mindsdb_sql.parser.ast import (
+    Select,
+    Identifier,
+    Join,
+    Show,
+    BinaryOperation,
+    Constant,
+    Star,
+    Delete,
+    Insert,
+    OrderBy,
+)
+
+
+def test_create_knowledge_base():
+    # create without select
+    sql = """
+        CREATE KNOWLEDGE_BASE my_knowledge_base
+        MODEL mindsdb.my_embedding_model
+        STORAGE my_vector_database.some_table
+    """
+    ast = parse_sql(sql, dialect="mindsdb")
+    expected_ast = CreateKnowledgeBase(
+        name=Identifier("my_knowledge_base"),
+        if_not_exists=False,
+        model=Identifier(parts=["mindsdb", "my_embedding_model"]),
+        storage=Identifier(parts=["my_vector_database", "some_table"]),
+        from_query=None,
+        params={},
+    )
+    assert ast == expected_ast
+
+    # the order of MODEL and STORAGE should not matter
+    # TODO: the current syntax is sensitive to the order
+    sql = """
+        CREATE KNOWLEDGE_BASE my_knowledge_base
+        STORAGE my_vector_database.some_table
+        MODEL mindsdb.my_embedding_model
+    """
+    with pytest.raises(Exception):
+        ast = parse_sql(sql, dialect="mindsdb")
+
+    # create from a query
+    sql = """
+        CREATE KNOWLEDGE_BASE my_knowledge_base
+        FROM (
+            SELECT id, content, embeddings, metadata
+            FROM my_table
+            JOIN my_embedding_model
+        )
+        MODEL mindsdb.my_embedding_model
+        STORAGE my_vector_database.some_table
+    """
+    ast = parse_sql(sql, dialect="mindsdb")
+    expected_ast = CreateKnowledgeBase(
+        name=Identifier("my_knowledge_base"),
+        if_not_exists=False,
+        model=Identifier(parts=["mindsdb", "my_embedding_model"]),
+        storage=Identifier(parts=["my_vector_database", "some_table"]),
+        from_query=Select(
+            targets=[
+                Identifier("id"),
+                Identifier("content"),
+                Identifier("embeddings"),
+                Identifier("metadata"),
+            ],
+            from_table=Join(
+                left=Identifier("my_table"),
+                right=Identifier("my_embedding_model"),
+                join_type="JOIN",
+            ),
+        ),
+        params={},
+    )
+
+    assert ast == expected_ast
+
+    # create without MODEL
+    # TODO: this should be an error
+    # we may allow this in the future when we have a default model
+    sql = """
+        CREATE KNOWLEDGE_BASE my_knowledge_base
+        STORAGE my_vector_database.some_table
+    """
+    with pytest.raises(Exception):
+        ast = parse_sql(sql, dialect="mindsdb")
+
+    # create without STORAGE
+    # TODO: this should be an error
+    # we may allow this in the future when we have a default storage
+    sql = """
+        CREATE KNOWLEDGE_BASE my_knowledge_base
+        MODEL mindsdb.my_embedding_model
+    """
+    with pytest.raises(Exception):
+        ast = parse_sql(sql, dialect="mindsdb")
+
+    # create if not exists
+    sql = """
+        CREATE KNOWLEDGE_BASE IF NOT EXISTS my_knowledge_base
+        MODEL mindsdb.my_embedding_model
+        STORAGE my_vector_database.some_table
+    """
+    ast = parse_sql(sql, dialect="mindsdb")
+    expected_ast = CreateKnowledgeBase(
+        name=Identifier("my_knowledge_base"),
+        if_not_exists=True,
+        model=Identifier(parts=["mindsdb", "my_embedding_model"]),
+        storage=Identifier(parts=["my_vector_database", "some_table"]),
+        from_query=None,
+        params={},
+    )
+    assert ast == expected_ast
+
+    # create with params
+    sql = """
+        CREATE KNOWLEDGE_BASE my_knowledge_base
+        MODEL mindsdb.my_embedding_model
+        STORAGE my_vector_database.some_table
+        USING
+            some_param = 'some value',
+            other_param = 'other value'
+    """
+    ast = parse_sql(sql, dialect="mindsdb")
+    expected_ast = CreateKnowledgeBase(
+        name=Identifier("my_knowledge_base"),
+        if_not_exists=False,
+        model=Identifier(parts=["mindsdb", "my_embedding_model"]),
+        storage=Identifier(parts=["my_vector_database", "some_table"]),
+        from_query=None,
+        params={"some_param": "some value", "other_param": "other value"},
+    )
+    assert ast == expected_ast
+
+
+def test_drop_knowledge_base():
+    # drop if exists
+    sql = """
+        DROP KNOWLEDGE_BASE IF EXISTS my_knowledge_base
+    """
+    ast = parse_sql(sql, dialect="mindsdb")
+    expected_ast = DropKnowledgeBase(
+        name=Identifier("my_knowledge_base"), if_exists=True
+    )
+    assert ast == expected_ast
+
+    # drop without if exists
+    sql = """
+        DROP KNOWLEDGE_BASE my_knowledge_base
+    """
+    ast = parse_sql(sql, dialect="mindsdb")
+
+    expected_ast = DropKnowledgeBase(
+        name=Identifier("my_knowledge_base"), if_exists=False
+    )
+    assert ast == expected_ast
+
+
+@pytest.mark.skip(reason="not implemented")
+def test_alter_knowledge_base():
+    pass
+
+
+def test_show_knowledge_base():
+    sql = """
+        SHOW KNOWLEDGE_BASES
+    """
+    ast = parse_sql(sql, dialect="mindsdb")
+    expected_ast = Show(
+        category="KNOWLEDGE_BASES",
+    )
+    assert ast == expected_ast
+
+    # without underscore shall also work
+    sql = """
+        SHOW KNOWLEDGE BASES
+    """
+    ast = parse_sql(sql, dialect="mindsdb")
+    expected_ast = Show(
+        category="KNOWLEDGE BASES",
+    )
+    assert ast == expected_ast
+
+
+def test_select_from_knowledge_base():
+    # this is no different from a regular select
+    sql = """
+        SELECT * FROM my_knowledge_base
+        WHERE
+            query = 'some text in natural query'
+            AND
+            metadata.some_column = 'some value'
+        ORDER BY
+            distances DESC
+        LIMIT 10
+    """
+    ast = parse_sql(sql, dialect="mindsdb")
+
+    expected_ast = Select(
+        targets=[Star()],
+        from_table=Identifier("my_knowledge_base"),
+        where=BinaryOperation(
+            op="AND",
+            args=[
+                BinaryOperation(
+                    op="=",
+                    args=[Identifier("query"), Constant("some text in natural query")],
+                ),
+                BinaryOperation(
+                    op="=",
+                    args=[Identifier("metadata.some_column"), Constant("some value")],
+                ),
+            ],
+        ),
+        order_by=[OrderBy(field=Identifier("distances"), direction="DESC")],
+        limit=Constant(10),
+    )
+    assert ast == expected_ast
+
+
+def test_delete_from_knowledge_base():
+    # this is no different from a regular delete
+    sql = """
+        DELETE FROM my_knowledge_base
+        WHERE
+            id = 'some id'
+            AND
+            metadata.some_column = 'some value'
+    """
+    ast = parse_sql(sql, dialect="mindsdb")
+    expected_ast = Delete(
+        table=Identifier("my_knowledge_base"),
+        where=BinaryOperation(
+            op="AND",
+            args=[
+                BinaryOperation(op="=", args=[Identifier("id"), Constant("some id")]),
+                BinaryOperation(
+                    op="=",
+                    args=[Identifier("metadata.some_column"), Constant("some value")],
+                ),
+            ],
+        ),
+    )
+    assert ast == expected_ast
+
+
+def test_insert_into_knowledge_base():
+    # this is no different from a regular insert
+    sql = """
+        INSERT INTO my_knowledge_base (
+            id, content, embeddings, metadata
+        )
+        VALUES (
+            'some id',
+            'some text',
+            '[1,2,3,4,5]',
+            '{"some_column": "some value"}'
+        ),
+        (
+            'some other id',
+            'some other text',
+            '[1,2,3,4,5]',
+            '{"some_column": "some value"}'
+        )
+    """
+    ast = parse_sql(sql, dialect="mindsdb")
+    expected_ast = Insert(
+        table=Identifier("my_knowledge_base"),
+        columns=[
+            Identifier("id"),
+            Identifier("content"),
+            Identifier("embeddings"),
+            Identifier("metadata"),
+        ],
+        values=[
+            [
+                Constant("some id"),
+                Constant("some text"),
+                Constant("[1,2,3,4,5]"),
+                Constant('{"some_column": "some value"}'),
+            ],
+            [
+                Constant("some other id"),
+                Constant("some other text"),
+                Constant("[1,2,3,4,5]"),
+                Constant('{"some_column": "some value"}'),
+            ],
+        ],
+    )
+    assert ast == expected_ast
+
+    # insert from a select
+    sql = """
+        INSERT INTO my_knowledge_base (
+            id, content, embeddings, metadata
+        )
+        SELECT id, content, embeddings, metadata
+        FROM my_table
+        WHERE
+            metadata.some_column = 'some value'
+    """
+    ast = parse_sql(sql, dialect="mindsdb")
+    expected_ast = Insert(
+        table=Identifier("my_knowledge_base"),
+        columns=[
+            Identifier("id"),
+            Identifier("content"),
+            Identifier("embeddings"),
+            Identifier("metadata"),
+        ],
+        from_select=Select(
+            targets=[
+                Identifier("id"),
+                Identifier("content"),
+                Identifier("embeddings"),
+                Identifier("metadata"),
+            ],
+            from_table=Identifier("my_table"),
+            where=BinaryOperation(
+                op="=",
+                args=[Identifier("metadata.some_column"), Constant("some value")],
+            ),
+        ),
+    )
+    assert ast == expected_ast