Merge pull request #361 from mindsdb/staging

Release 0.10.6
mindsdb · Mar 14, 2024 · c766c87 · c766c87
2 parents 0b1c3a1 + 727cd00
commit c766c87
Show file tree

Hide file tree

Showing 22 changed files with 459 additions and 124 deletions.
diff --git a/README.md b/README.md
@@ -73,6 +73,32 @@ SLY does not support inheritance, therefore every dialect is described completel
   - get_string - to return object as sql expression (or sub-expression)
   - copy - to copy AST-tree to new object
 
+### Error handling
+
+For a better user experience, a parsing error contains useful information about the location of the problem and a possible way to fix it.
+1. It shows the location of the error if
+  - a character can't be parsed (by the lexer)
+  - a token is unexpected (by the parser)
+2. It tries to propose the correct token instead of (or before) the error location. Possible options:
+  - A keyword is shown as is.
+  - '[number]' - if a float or an integer is expected
+  - '[string]' - if a string is expected
+  - '[identifier]' - if the name of an object is expected. For example, the bold words here are identifiers:
+    - "select **x** as **name** from **tbl1** where **col**=1"
+
+How the suggestion works:
+It uses the next possible tokens defined by the syntax rules.
+If this is the end of the query, it just shows these tokens.
+Otherwise:
+- it tries to replace the bad token with another token from the list of possible tokens
+- it tries to parse the query once again; if there is no error:
+  - it adds this token to the suggestion list
+- second iteration: it puts the possible token before the bad token (instead of replacing it) and repeats the same operation.
+
+Example:
+![image](https://github.com/mindsdb/mindsdb_sql/assets/8502631/c4707087-ca6e-47f6-aaba-db3a641947a6)
+
+
 # Planner
 
 

diff --git a/mindsdb_sql/__about__.py b/mindsdb_sql/__about__.py
@@ -1,6 +1,6 @@
 __title__ = 'mindsdb_sql'
 __package_name__ = 'mindsdb_sql'
-__version__ = '0.10.5'
+__version__ = '0.10.6'
 __description__ = "Pure python SQL parser"
 __email__ = "[email protected]"
 __author__ = 'MindsDB Inc'

diff --git a/mindsdb_sql/__init__.py b/mindsdb_sql/__init__.py
@@ -1,9 +1,166 @@
 import re
+from collections import defaultdict
+
+from sly.lex import Token
 
 from mindsdb_sql.exceptions import ParsingException
 from mindsdb_sql.parser.ast import *
 
 
+class ErrorHandling:
+    """Build a human-readable message for a query that failed to parse.
+
+    The message shows the location of the error in the query text and,
+    when possible, lists the inputs that were expected at that location.
+    """
+
+    def __init__(self, lexer, parser):
+        # parser: used to re-parse modified token lists when checking suggestions
+        # lexer: its attributes named after token types are read to get token patterns
+        self.parser = parser
+        self.lexer = lexer
+
+    def process(self, error_info):
+        """Return the error message for the failure described by `error_info`.
+
+        `error_info` is a dict with the keys:
+          - 'tokens': tokens of the query, used and unused (None items are dropped)
+          - 'bad_token': the token the error is reported at,
+             or None if the query ended unexpectedly
+          - 'expected_tokens': names of the token types that are looked up
+             on the lexer to build suggestions
+        """
+        self.tokens = [t for t in error_info['tokens'] if t is not None]
+        self.bad_token = error_info['bad_token']
+        self.expected_tokens = error_info['expected_tokens']
+
+        if len(self.tokens) == 0:
+            return 'Empty input'
+
+        # show error location
+        msgs = self.error_location()
+
+        # suggestion
+        suggestions = self.make_suggestion()
+
+        if suggestions:
+            prefix = 'Possible inputs: ' if len(suggestions) > 1 else 'Expected symbol: '
+            msgs.append(prefix + ', '.join([f'"{item}"' for item in suggestions]))
+        return '\n'.join(msgs)
+
+    def error_location(self):
+        """Return message lines: a header, nearby query lines and an error pointer.
+
+        The query text is restored from the tokens: every token value is put
+        at `token.index` in the line `token.lineno`.
+        """
+
+        # restore query text
+        lines_idx = defaultdict(str)
+
+        # used + unused tokens
+        for token in self.tokens:
+            if token is None:
+                continue
+            line = lines_idx[token.lineno]
+
+            # cut or pad the line so that the token starts exactly at its index
+            if len(line) > token.index:
+                line = line[: token.index]
+            else:
+                line = line.ljust(token.index)
+
+            line += token.value
+            lines_idx[token.lineno] = line
+
+        msgs = []
+
+        # error message and location
+        if self.bad_token is None:
+            msgs.append('Syntax error, unexpected end of query:')
+            error_len = 1
+            # last line
+            error_line_num = list(lines_idx.keys())[-1]
+            error_index = len(lines_idx[error_line_num])
+        else:
+            msgs.append('Syntax error, unknown input:')
+            error_len = len(self.bad_token.value)
+            error_line_num = self.bad_token.lineno
+            error_index = self.bad_token.index
+
+        # shift lines indexes (it removes spaces from beginnings of the lines)
+        lines = []
+        shift = 0
+        error_line = 0
+        for i, line_num in enumerate(lines_idx.keys()):
+            if line_num == error_line_num:
+                error_index -= shift
+                error_line = i
+
+            line = lines_idx[line_num]
+            lines.append(line[shift:])
+            # the next line is cut by the length of the current restored line
+            shift = len(line)
+
+        # add source code: the error line and up to two lines before it
+        first_line = error_line - 2 if error_line > 1 else 0
+        for line in lines[first_line: error_line + 1]:
+            msgs.append('>' + line)
+
+        # error position (one extra '-' stands for the '>' prefix of the code lines)
+        msgs.append('-' * (error_index + 1) + '^' * error_len)
+        return msgs
+
+    def make_suggestion(self):
+        """Return the list of inputs that can be suggested at the error location.
+
+        Keywords are returned as is; '[identifier]', '[number]' and '[string]'
+        stand for the corresponding kinds of tokens. If there are several
+        candidates and the error is not at the end of the query, a candidate
+        is kept only if the query becomes parsable with it.
+        """
+        if len(self.expected_tokens) == 0:
+            return []
+
+        # find error index
+        error_index = None
+        for i, token in enumerate(self.tokens):
+            if token is self.bad_token:
+                error_index = i
+
+        expected = {}  # value: token
+
+        for token_name in self.expected_tokens:
+            value = getattr(self.lexer, token_name, None)
+            if token_name == 'ID':
+                # a lot of other tokens could be ID
+                expected = {'[identifier]': token_name}
+                break
+            elif token_name in ('FLOAT', 'INTEGER'):
+                expected['[number]'] = token_name
+
+            elif token_name in ('DQUOTE_STRING', 'QUOTE_STRING'):
+                expected['[string]'] = token_name
+
+            elif isinstance(value, str):
+                # skip patterns that contain regexp constructs.
+                # It has to be checked on the raw pattern: '\s' can't be found
+                # after backslashes are removed
+                if '\\s' in value or '|' in value:
+                    continue
+
+                value = value.replace('\\b', '').replace('\\', '')
+                expected[value] = token_name
+
+        suggestions = []
+        if len(expected) == 1:
+            # use only it
+            first_value = list(expected.keys())[0]
+            suggestions.append(first_value)
+
+        elif 1 < len(expected) < 20:
+            if self.bad_token is None:
+                # if this is the end of query, just show next expected keywords
+                return list(expected.keys())
+
+            if error_index is None:
+                # bad token is not in the list of tokens: no position to check candidates at
+                return suggestions
+
+            # not every suggestion satisfy the end of the query. we have to check if it works
+            for value, token_name in expected.items():
+                # make up a token
+                token = Token()
+                token.type = token_name
+                token.value = value
+                token.end = 0
+                token.index = 0
+                token.lineno = 0
+
+                # try to add token (right before the bad token)
+                tokens2 = self.tokens[:error_index] + [token] + self.tokens[error_index:]
+                if self.query_is_valid(tokens2):
+                    suggestions.append(value)
+                    continue
+
+                # try to replace token: the slices below drop the token that
+                # precedes the bad one and put the candidate in its place.
+                # NOTE(review): README describes this step as replacing the bad token - confirm intent
+                if error_index == 0:
+                    # nothing precedes the bad token; a bound of -1 would wrap around the list
+                    continue
+                tokens2 = self.tokens[:error_index - 1] + [token] + self.tokens[error_index:]
+                if self.query_is_valid(tokens2):
+                    suggestions.append(value)
+                    continue
+
+        return suggestions
+
+    def query_is_valid(self, tokens):
+        """Return True if the parser returns a result (not None) for the list of tokens."""
+
+        ast = self.parser.parse(iter(tokens))
+        return ast is not None
+
+
 def get_lexer_parser(dialect):
     if dialect == 'sqlite':
         from mindsdb_sql.parser.lexer import SQLLexer
@@ -29,4 +186,12 @@ def parse_sql(sql, dialect='mindsdb'):
     lexer, parser = get_lexer_parser(dialect)
     tokens = lexer.tokenize(sql)
     ast = parser.parse(tokens)
+
+    if ast is None:
+
+        eh = ErrorHandling(lexer, parser)
+        message = eh.process(parser.error_info)
+
+        raise ParsingException(message)
+
     return ast
diff --git a/mindsdb_sql/parser/ast/create.py b/mindsdb_sql/parser/ast/create.py
@@ -9,11 +9,12 @@
 
 
 class TableColumn():
-    def __init__(self, name, type='integer'):
+    def __init__(self, name, type='integer', length=None):
         self.name = name
         self.type = type
         self.is_primary_key = False
         self.default = None
+        self.length = length
 
 
 class CreateTable(ASTNode):
@@ -72,14 +73,18 @@ def get_string(self, *args, **kwargs):
         if self.columns is not None:
             columns = []
             for col in self.columns:
-                type = str(col.type)
-                if sa_types is not None:
+
+                if not isinstance(col.type, str) and sa_types is not None:
                     if issubclass(col.type, sa_types.Integer):
                         type = 'int'
                     elif issubclass(col.type, sa_types.Float):
                         type = 'float'
                     elif issubclass(col.type, sa_types.Text):
                         type = 'text'
+                else:
+                    type = str(col.type)
+                if col.length is not None:
+                    type = f'{type}({col.length})'
                 columns.append( f'{col.name} {type}')
 
             columns_str = '({})'.format(', '.join(columns))

diff --git a/mindsdb_sql/parser/ast/select/__init__.py b/mindsdb_sql/parser/ast/select/__init__.py
@@ -7,7 +7,8 @@
 from .join import Join
 from .type_cast import TypeCast
 from .tuple import Tuple
-from .operation import Operation, BinaryOperation, UnaryOperation, BetweenOperation, Function, WindowFunction, Object
+from .operation import (Operation, BinaryOperation, UnaryOperation, BetweenOperation,
+                        Function, WindowFunction, Object, Interval)
 from .order_by import OrderBy
 from .parameter import Parameter
 from .case import Case

diff --git a/mindsdb_sql/parser/ast/select/operation.py b/mindsdb_sql/parser/ast/select/operation.py
@@ -167,3 +167,19 @@ def to_string(self, *args, **kwargs):
 
     def __repr__(self):
         return self.to_tree()
+
+
+class Interval(Operation):
+    """INTERVAL expression: an operation named 'interval' that holds a single argument."""
+
+    def __init__(self, info):
+        super().__init__(op='interval', args=[info])
+
+    def get_string(self, *args, **kwargs):
+        # the argument is rendered with its repr()
+        return f'INTERVAL {self.args[0]!r}'
+
+    def to_tree(self, *args, level=0, **kwargs):
+        # tree form is the same as the string form; `level` is not used
+        return self.get_string(*args, **kwargs)
+
+    def assert_arguments(self):
+        if len(self.args) == 1:
+            return
+        raise ParsingException(f'Expected one argument for operation "{self.op}"')
diff --git a/mindsdb_sql/parser/dialects/mindsdb/__init__.py b/mindsdb_sql/parser/dialects/mindsdb/__init__.py
@@ -9,7 +9,6 @@
 from .drop_dataset import DropDataset
 from .evaluate import Evaluate
 from .latest import Latest
-from .create_file import CreateFile
 from .create_ml_engine import CreateMLEngine
 from .drop_ml_engine import DropMLEngine
 from .create_job import CreateJob

diff --git a/mindsdb_sql/parser/dialects/mindsdb/create_file.py b/mindsdb_sql/parser/dialects/mindsdb/create_file.py
diff --git a/mindsdb_sql/parser/dialects/mindsdb/lexer.py b/mindsdb_sql/parser/dialects/mindsdb/lexer.py
@@ -1,5 +1,6 @@
 import re
 from sly import Lexer
+from sly.lex import LexError
 
 """
 Unfortunately we can't inherit from base SQLLexer, because the order of rules is important.
@@ -14,7 +15,7 @@ class MindsDBLexer(Lexer):
     ignore_line_comment = r'--[^\n]*'
 
     tokens = {
-        USE, DROP, CREATE, DESCRIBE, RETRAIN,REPLACE,
+        USE, DROP, CREATE, DESCRIBE, RETRAIN, REPLACE,
 
         # Misc
         SET, START, TRANSACTION, COMMIT, ROLLBACK, ALTER, EXPLAIN,
@@ -72,7 +73,7 @@ class MindsDBLexer(Lexer):
         EQUALS, NEQUALS, GREATER, GEQ, LESS, LEQ,
         AND, OR, NOT, IS, IS_NOT,
         IN, LIKE, NOT_LIKE, CONCAT, BETWEEN, WINDOW, OVER, PARTITION_BY,
-        JSON_GET, JSON_GET_STR,
+        JSON_GET, JSON_GET_STR, INTERVAL,
 
         # Data types
         CAST, ID, INTEGER, FLOAT, QUOTE_STRING, DQUOTE_STRING, NULL, TRUE, FALSE,
@@ -287,6 +288,7 @@ class MindsDBLexer(Lexer):
     CAST = r'\bCAST\b'
     CONCAT = r'\|\|'
     BETWEEN = r'\bBETWEEN\b'
+    INTERVAL = r'\bINTERVAL\b'
     WINDOW = r'\bWINDOW\b'
     OVER = r'\bOVER\b'
     PARTITION_BY = r'\bPARTITION BY\b'
@@ -308,12 +310,12 @@ def FLOAT(self, t):
     def INTEGER(self, t):
         return t
 
-    @_(r"'(?:[^\'\\]|\\.)*'")
+    @_(r"'(?:\\.|[^'])*'")
     def QUOTE_STRING(self, t):
         t.value = t.value.replace('\\"', '"').replace("\\'", "'")
         return t
 
-    @_(r'"(?:[^\"\\]|\\.)*"')
+    @_(r'"(?:\\.|[^"])*"')
     def DQUOTE_STRING(self, t):
         t.value = t.value.replace('\\"', '"').replace("\\'", "'")
         return t
@@ -354,3 +356,25 @@ def SYSTEM_VARIABLE(self, t):
             t.value = t.value.strip('`')
         return t
 
+    def error(self, t):
+        """Raise LexError with a message that points at the illegal character.
+
+        The message contains the illegal character, the line of the query
+        where it is located (with the previous line, if there is one)
+        and a pointer to its position in the line.
+
+        :param t: token of the unparsed input; `t.index` is used as the position
+                  in `self.text` and `t.value[0]` as the illegal character
+        :raises LexError: always
+        """
+
+        # convert to lines
+        lines = []
+        shift = 0  # position of the beginning of the current line in the text
+        error_line = 0
+        error_index = 0
+        for i, line in enumerate(self.text.split('\n')):
+            if 0 <= t.index - shift < len(line):
+                error_line = i
+                error_index = t.index - shift
+            lines.append(line)
+            # +1 for the '\n' that was removed by split
+            shift += len(line) + 1
+
+        msgs = [f'Illegal character {t.value[0]!r}:']
+        # show error code
+        # start can't be negative: for the first line a start of -1 would
+        # select nothing (or the last line) instead of the error line
+        first_line = max(error_line - 1, 0)
+        for line in lines[first_line: error_line + 1]:
+            msgs.append('>' + line)
+
+        # one extra '-' stands for the '>' prefix of the code lines
+        msgs.append('-' * (error_index + 1) + '^')
+
+        raise LexError('\n'.join(msgs), t.value, self.index)