diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml index 12b7b16..b37e24f 100644 --- a/.github/workflows/pypi.yml +++ b/.github/workflows/pypi.yml @@ -13,7 +13,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: 3.9 + python-version: 3.11.1 - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 3dbdb40..51c6c25 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -11,7 +11,7 @@ jobs: with: python-version: 3.11.1 - - name: Code Formatting (App) + - name: Run tests run: | pip install . pip install pytest diff --git a/.github/workflows/windows-tests.yml b/.github/workflows/windows-tests.yml new file mode 100644 index 0000000..4fb6080 --- /dev/null +++ b/.github/workflows/windows-tests.yml @@ -0,0 +1,18 @@ +name: Code Tests (On Windows) + +on: push + +jobs: + formatting: + runs-on: windows-latest + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-python@v2 + with: + python-version: 3.11.1 + + - name: Run tests + run: | + pip install . + pip install pytest + python3 -m pytest . -s diff --git a/README.md b/README.md index c1b183e..36f6385 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -

Salve v0.6.0

+

Salve v0.7.0

# Installation @@ -8,8 +8,10 @@ In the Command Line, paste the following: `pip install salve_ipc` Salve is an IPC library that can be used by code editors to easily get autocompletions, replacements, editorconfig suggestions, definitions, and syntax highlighting. -> **Note** -> The first time that the system is loaded or a new server needs to be started it will take a fair bit longer. Additionally, any usage of IPC needs to eventually be called from an `if __name__ == "__main__":` block to prevent a multiproccesing error. +> **Notes:** +> - The first time that the system is loaded or a new server needs to be started it will take a fair bit longer. +> - Any usage of IPC needs to eventually be called from an `if __name__ == "__main__":` block to prevent a multiprocessing error. +> - Due to the way Windows handles chars the hidden character highlighter may not work properly. See [#57](https://github.com/Moosems/salve/pull/57). If anyone knows how to fix this, I would greatly appreciate you opening a PR :) ## Documentation @@ -94,6 +96,11 @@ if __name__ == "__main__": To contribute, fork the repository, make your changes, and then make a pull request. If you want to add a feature, please open an issue first so it can be discussed. Note that whenever and wherever possible you should try to use stdlib modules rather than external ones. +## Required Python Version: 3.11+ + +Salve IPC will support the three most recent Python versions (full releases) going forward and will drop any older versions as I hope to keep this package up to date with modern Python versions. +Currently 3.11 is the minimum (instead of 3.10) as it is what Salve IPC was developed under. However, after 3.14 is released, the minimum will be 3.12, and it will change accordingly in the future as described above. + ## License This project is licensed under the MIT License - see the [LICENSE](./LISCENSE). 
diff --git a/salve_ipc/ipc.py b/salve_ipc/ipc.py index de7877a..70a2587 100644 --- a/salve_ipc/ipc.py +++ b/salve_ipc/ipc.py @@ -2,6 +2,7 @@ from multiprocessing.connection import Connection from pathlib import Path from random import randint +from sys import platform from .misc import ( COMMAND, @@ -15,6 +16,12 @@ ) from .server import Server +# Deal with Windows weirdness +if platform == "win32": + from multiprocessing.connection import ( + PipeConnection as Connection, # type: ignore + ) + class IPC: """The IPC class is used to talk to the server and run commands. The public API includes the following methods: diff --git a/salve_ipc/server.py b/salve_ipc/server.py index ff065aa..f120414 100644 --- a/salve_ipc/server.py +++ b/salve_ipc/server.py @@ -1,5 +1,6 @@ from multiprocessing.connection import Connection from multiprocessing.queues import Queue as GenericClassQueue +from sys import platform from time import sleep from pyeditorconfig import get_config @@ -20,6 +21,12 @@ get_replacements, ) +# Deal with Windows weirdness +if platform == "win32": + from multiprocessing.connection import ( + PipeConnection as Connection, # type: ignore + ) + class Server: """Handles input from the user and returns output from special functions designed to make the job easy. 
Not an external API.""" diff --git a/salve_ipc/server_functions/__init__.py b/salve_ipc/server_functions/__init__.py index faa2fc1..e9b00e6 100644 --- a/salve_ipc/server_functions/__init__.py +++ b/salve_ipc/server_functions/__init__.py @@ -1,5 +1,6 @@ from .autocompletions import find_autocompletions # noqa: F401 from .definitions import get_definition # noqa: F401 from .highlight import get_highlights # noqa: F401 -from .misc import Token, generic_tokens, is_unicode_letter # noqa: F401 +from .highlight import Token, generic_tokens # noqa: F401 +from .misc import is_unicode_letter # noqa: F401 from .replacements import get_replacements # noqa: F401 diff --git a/salve_ipc/server_functions/definitions.py b/salve_ipc/server_functions/definitions.py index dd6dbf9..3a23642 100644 --- a/salve_ipc/server_functions/definitions.py +++ b/salve_ipc/server_functions/definitions.py @@ -1,6 +1,7 @@ from re import Match, Pattern, compile -from .misc import Token, find_words +from .highlight import Token +from .misc import find_words def get_definition( diff --git a/salve_ipc/server_functions/find_words.py b/salve_ipc/server_functions/find_words.py deleted file mode 100644 index 8b13789..0000000 --- a/salve_ipc/server_functions/find_words.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/salve_ipc/server_functions/highlight.py b/salve_ipc/server_functions/highlight.py deleted file mode 100644 index 946cb6f..0000000 --- a/salve_ipc/server_functions/highlight.py +++ /dev/null @@ -1,364 +0,0 @@ -from re import MULTILINE, Match, Pattern, compile - -from beartype.typing import Callable -from pygments import lex -from pygments.lexer import Lexer, RegexLexer, default -from pygments.lexers import get_lexer_by_name -from pygments.token import Comment as CommentToken -from pygments.token import String as StringToken - -from .misc import Token, generic_tokens - -default_tokens: list[str] = [ - "Token.Text.Whitespace", - "Token.Text", - "Token.Error", - "Token.Keyword", - "Token.Name", - 
"Token.Literal.String", - "Token.Literal.Number", - "Token.Literal", - "Token.Operator", - "Token.Punctuation", - "Token.Comment", - "Token.Generic", -] - - -def get_new_token_type(old_token: str) -> str: - """Turns pygments token types into a generic predefined Token""" - new_type: str = generic_tokens[0] - for index, token in enumerate(default_tokens): - if old_token.startswith(token): - new_type = generic_tokens[index] - break - return new_type - - -url_regex: Pattern = compile(r"(ftp|http|https):\/\/[a-zA-Z0-9_-]") - - -def get_urls(lines: list[str], start_line: int = 1) -> list[Token]: - start_pos: tuple[int, int] = (start_line, 0) - url_toks: list[Token] = [] - while True: - if start_pos[0] >= len(lines) + start_line: - break - line: str = lines[start_pos[0] - start_line][start_pos[1] :] - match_start: Match[str] | None = url_regex.search(line) - if match_start is None: - start_pos = (start_pos[0] + 1, 0) - continue - token_start_col = match_start.span()[0] # type: ignore - url: str = line[token_start_col:] - - # Narrow down the url - url = url.strip() - url = url.split()[0] - url = url.split("'")[0] - url = url.split("`")[0] - url = url.split('"')[0] - url = url.rstrip(".,?!") - if "(" not in url: # urls can contain spaces (e.g. 
wikipedia) - url = url.rstrip(")") - url = url.rstrip(".,?!") - - url_len: int = len(url) - token: Token = ((start_pos[0], token_start_col), url_len, "Link") - url_toks.append(token) - start_pos = (start_pos[0], start_pos[1] + url_len + token_start_col) - - return url_toks - - -hidden_chars: dict[str, str] = { - "\u0009": "CHARACTER TABULATION", - "\u00a0": "NO-BREAK SPACE", - "\u00ad": "SOFT HYPHEN", - "\u034f": "COMBINING GRAPHEME JOINER", - "\u061c": "ARABIC LETTER MARK", - "\u115f": "HANGUL CHOSEONG FILLER", - "\u1160": "HANGUL JUNGSEONG FILLER", - "\u17b4": "KHMER VOWEL INHERENT AQ", - "\u17b5": "KHMER VOWEL INHERENT AA", - "\u180e": "MONGOLIAN VOWEL SEPARATOR", - "\u2000": "EN QUAD", - "\u2001": "EM QUAD", - "\u2002": "EN SPACE", - "\u2003": "EM SPACE", - "\u2004": "THREE-PER-EM SPACE", - "\u2005": "FOUR-PER-EM SPACE", - "\u2006": "SIX-PER-EM SPACE", - "\u2007": "FIGURE SPACE", - "\u2008": "PUNCTUATION SPACE", - "\u2009": "THIN SPACE", - "\u200a": "HAIR SPACE", - "\u200b": "ZERO WIDTH SPACE", - "\u200c": "ZERO WIDTH NON-JOINER", - "\u200d": "ZERO WIDTH JOINER", - "\u200e": "LEFT-TO-RIGHT MARK", - "\u200f": "RIGHT-TO-LEFT MARK", - "\u202f": "NARROW NO-BREAK SPACE", - "\u205f": "MEDIUM MATHEMATICAL SPACE", - "\u2060": "WORD JOINER", - "\u2061": "FUNCTION APPLICATION", - "\u2062": "INVISIBLE TIMES", - "\u2063": "INVISIBLE SEPARATOR", - "\u2064": "INVISIBLE PLUS", - "\u206a": "INHIBIT SYMMETRIC SWAPPING", - "\u206b": "ACTIVATE SYMMETRIC SWAPPING", - "\u206c": "INHIBIT ARABIC FORM SHAPING", - "\u206d": "ACTIVATE ARABIC FORM SHAPING", - "\u206e": "NATIONAL DIGIT SHAPES", - "\u206f": "NOMINAL DIGIT SHAPES", - "\u3000": "IDEOGRAPHIC SPACE", - "\u2800": "BRAILLE PATTERN BLANK", - "\u3164": "HANGUL FILLER", - "\ufeff": "ZERO WIDTH NO-BREAK SPACE", - "\uffa0": "HALFWIDTH HANGUL FILLER", - "\u1d159": "MUSICAL SYMBOL NULL NOTEHEAD", - "\u1d173": "MUSICAL SYMBOL BEGIN BEAM", - "\u1d174": "MUSICAL SYMBOL END BEAM", - "\u1d175": "MUSICAL SYMBOL BEGIN TIE", - "\u1d176": 
"MUSICAL SYMBOL END TIE", - "\u1d177": "MUSICAL SYMBOL BEGIN SLUR", - "\u1d178": "MUSICAL SYMBOL END SLUR", - "\u1d179": "MUSICAL SYMBOL BEGIN PHRASE", - "\u1d17A": "MUSICAL SYMBOL END PHRASE", - "\ue0020": "TAG SPACE", -} - - -def find_hidden_chars(lines: list[str], start_line: int = 1) -> list[Token]: - hidden_char_indexes: list[tuple[tuple[int, int], str]] = [ - ((line_index + start_line, char_index), char) - for line_index, line in enumerate(lines) - for char_index, char in enumerate(line) - if char in list(hidden_chars.keys()) - ] - tok_list: list[Token] = [ - (char[0], len(char[1]), "Hidden_Char") for char in hidden_char_indexes - ] - return tok_list - - -# Instantiate some useful variables/types for the following functions -useful_toks = { - StringToken.Doc, - StringToken.Heredoc, - CommentToken, - CommentToken.Multiline, -} - -# Beartype speed optimizations -_TokenType = type(StringToken) # Resolves to pygments.token._TokenType -_TokenTupleInternalType = tuple[_TokenType | Callable, ...] 
-_TokenTupleReturnType = list[tuple[str, _TokenType]] -_ListOfStrs = list[str] -_LexReturnTokens = list[tuple[_TokenType, str]] - - -def get_pygments_comment_regexes(lexer: RegexLexer) -> _TokenTupleReturnType: - """ - Steals the regexes that pgments uses to give docstring, heredoc, comment, and multiline comment highlights - (css comments, though multine, aren't called multiline comments) - """ - - regexes: _TokenTupleReturnType = [] - - for path in lexer.tokens: - # This should have a better type definition but I didn't have the mental capacity to - # write each possibility so I'm waiting for beartype to implement the functionality for me like the bum I am - path_tokens: list = lexer.tokens[path] - - if isinstance(path_tokens[0], str): - # This means that the path is redirecting to another path in its place but we check them all anyway so just exit this path - continue - - for token_tuple in path_tokens: - # Ensure that this is actually a tuple and not a random type - if isinstance(token_tuple, default): - continue - - if token_tuple[1] in useful_toks: - regexes.append((token_tuple[0], token_tuple[1])) - continue - - # The Token tuple SHOULD be a callable at this point - if not callable(token_tuple[1]): - continue - - pygments_func: Callable = token_tuple[1] - - if pygments_func.__closure__ is None: - # Will always evaluate to False but its for the static type checkers appeasement - continue - - tokens: _TokenTupleInternalType = [ - cell.cell_contents for cell in token_tuple[1].__closure__ - ][ - 0 - ] # Sometimes pygments hides these types in functional programming - - for token in tokens: - if token in useful_toks: - # We know if its in the useful tokens list that its a token type but the static type checker doesn't - regexes.append((token_tuple[0], token)) # type: ignore - continue - - return list(set(regexes)) # type: ignore - - -def proper_docstring_tokens(lexer: RegexLexer, full_text: str) -> list[Token]: - proper_highlight_regexes: _TokenTupleReturnType = 
( - get_pygments_comment_regexes(lexer) - ) - - new_docstring_tokens: list[Token] = [] - split_text: _ListOfStrs = full_text.splitlines() - - for regex, token_type in proper_highlight_regexes: - current_text = full_text - match: Match[str] | None = compile(regex, flags=MULTILINE).search( - full_text - ) - - if match is None: - # Onwards to the next regex! - continue - - start_pos: tuple[int, int] = (1, 0) - simple_token_type: str = get_new_token_type(str(token_type)) - - while match: - span: tuple[int, int] = match.span() - matched_str: str = current_text[span[0] : span[1]] - - # Remove any whitespace previous to the match and update span accordingly - matched_len_initial: int = len(matched_str) - matched_str = matched_str.lstrip() - matched_len_lstripped: int = len(matched_str) - span = ( - (span[0] + matched_len_initial - matched_len_lstripped), - span[1], - ) - - # Other useful variables without relation - newline_count: int = matched_str.count("\n") - previous_text: str = current_text[: span[0]] - - start_line: int = previous_text.count("\n") + start_pos[0] - - # Deal with the easy case first - if not newline_count: - # Prepare token variables - start_col: int = split_text[start_line].find(matched_str) - current_text: str = full_text[span[0] + span[1] - span[0] :] - - # Create and add token - token: Token = ( - (start_line, start_col), - matched_len_lstripped, - simple_token_type, - ) - new_docstring_tokens.append(token) - - start_pos = (start_line, start_col + matched_len_lstripped) - current_text = current_text[: span[1]] - - # Continue onward! 
- match = compile(regex, flags=MULTILINE).search(current_text) - continue - - # Now for multiple line matches - split_match: list[str] = matched_str.splitlines() - for i in range(newline_count + 1): - match_str: str = split_match[i] - initial_len: int = len(match_str) - start_col: int = initial_len - len(match_str.lstrip()) - - if i == 0: - line: str = split_text[start_line - 1] - - true_len: int = len(line) - lstripped_len: int = len(line.lstrip()) - initial_len = lstripped_len - if lstripped_len != true_len: - # In case the regex doesn't skip whitespace/junk - initial_len = true_len - - start_col = line.find(match_str) - - # Create and add token - token: Token = ( - (start_line + i, start_col), - initial_len - start_col, - simple_token_type, - ) - new_docstring_tokens.append(token) - - start_pos = (start_line + i, start_col + len(match_str)) - - # Continue onward! - current_text = current_text[span[1] :] - match = compile(regex, flags=MULTILINE).search(current_text) - - return new_docstring_tokens - - -def get_highlights( - full_text: str, - language: str = "text", - text_range: tuple[int, int] = (1, -1), -) -> list[Token]: - """Gets pygments tokens from text provided in language proved and converts them to Token's""" - - # Create some variables used all throughout the function - lexer: Lexer = get_lexer_by_name(language) - split_text: _ListOfStrs = full_text.splitlines() - new_tokens: list[Token] = [] - - if text_range[1] == -1: - # This indicates that the text range should span the length of the entire code - text_range = (text_range[0], len(split_text)) - - start_index: tuple[int, int] = (text_range[0], 0) - # We want only the lines in the text range because this list is iterated - split_text: _ListOfStrs = split_text[text_range[0] - 1 : text_range[1]] - - for line in split_text: - og_tokens: _LexReturnTokens = list(lex(line, lexer)) - for token in og_tokens: - new_type: str = get_new_token_type(str(token[0])) - token_str: str = token[1] - token_len: int = 
len(token_str) - - if token_str == "\n": - # Lexer adds the newline back as its own token - continue - - if not token_str.strip() and new_type == "Text": - # If the token is empty or is plain Text we simply skip it because thats ultimately useless info - start_index = (start_index[0], start_index[1] + token_len) - continue - - # Create and append the Token that will be returned - new_token = (start_index, token_len, new_type) - new_tokens.append(new_token) - - start_index = (start_index[0], start_index[1] + token_len) - start_index = (start_index[0] + 1, 0) - - # Add extra token types - # NOTE: we add these at the end so that when they are applied one by one by the editor these - # override older tokens that may not be as accurate - - if isinstance(lexer, RegexLexer): - new_tokens += proper_docstring_tokens(lexer, full_text) - - new_tokens += get_urls(split_text, text_range[0]) - if [char for char in hidden_chars if char in full_text]: - # if there are not hidden chars we don't want to needlessly compute this - new_tokens += find_hidden_chars(split_text, text_range[0]) - - return new_tokens diff --git a/salve_ipc/server_functions/highlight/__init__.py b/salve_ipc/server_functions/highlight/__init__.py new file mode 100644 index 0000000..126f3b4 --- /dev/null +++ b/salve_ipc/server_functions/highlight/__init__.py @@ -0,0 +1,2 @@ +from .highlight import get_highlights # noqa: F401 +from .tokens import Token, generic_tokens # noqa: F401 diff --git a/salve_ipc/server_functions/highlight/docstring_highlight.py b/salve_ipc/server_functions/highlight/docstring_highlight.py new file mode 100644 index 0000000..e770b2a --- /dev/null +++ b/salve_ipc/server_functions/highlight/docstring_highlight.py @@ -0,0 +1,172 @@ +from functools import cache +from re import MULTILINE, Match, compile + +from beartype.typing import Callable +from pygments.lexer import RegexLexer, default +from pygments.token import Comment as CommentToken +from pygments.token import String as StringToken + 
+from .tokens import Token, get_new_token_type + +useful_toks = { + StringToken.Doc, + StringToken.Heredoc, + CommentToken, + CommentToken.Multiline, +} + +# Beartype speed optimizations +_TokenType = type(StringToken) # Resolves to pygments.token._TokenType +_TokenTupleInternalType = tuple[_TokenType | Callable, ...] +_TokenTupleReturnType = list[tuple[str, _TokenType]] +_ListOfStrs = list[str] +_LexReturnTokens = list[tuple[_TokenType, str]] + + +@cache +def get_pygments_comment_regexes(lexer: RegexLexer) -> _TokenTupleReturnType: + """ + Steals the regexes that pgments uses to give docstring, heredoc, comment, and multiline comment highlights + (css comments, though multine, aren't called multiline comments) + """ + + regexes: _TokenTupleReturnType = [] + + for path in lexer.tokens: + # This should have a better type definition but I didn't have the mental capacity to + # write each possibility so I'm waiting for beartype to implement the functionality for me like the bum I am + path_tokens: list = lexer.tokens[path] + + if isinstance(path_tokens[0], str): + # This means that the path is redirecting to another path in its place but we check them all anyway so just exit this path + continue + + for token_tuple in path_tokens: + # Ensure that this is actually a tuple and not a random type + if isinstance(token_tuple, default): + continue + + if token_tuple[1] in useful_toks: + regexes.append((token_tuple[0], token_tuple[1])) + continue + + # The Token tuple SHOULD be a callable at this point + if not callable(token_tuple[1]): + continue + + pygments_func: Callable = token_tuple[1] + + if pygments_func.__closure__ is None: + # Will always evaluate to False but its for the static type checkers appeasement + continue + + tokens: _TokenTupleInternalType = [ + cell.cell_contents for cell in token_tuple[1].__closure__ + ][ + 0 + ] # Sometimes pygments hides these types in functional programming + + for token in tokens: + if token in useful_toks: + # We know if its in 
the useful tokens list that its a token type but the static type checker doesn't + regexes.append((token_tuple[0], token)) # type: ignore + continue + + return list(set(regexes)) # type: ignore + + +def proper_docstring_tokens(lexer: RegexLexer, full_text: str) -> list[Token]: + proper_highlight_regexes: _TokenTupleReturnType = ( + get_pygments_comment_regexes(lexer) + ) + + new_docstring_tokens: list[Token] = [] + split_text: _ListOfStrs = full_text.splitlines() + + for regex, token_type in proper_highlight_regexes: + current_text = full_text + match: Match[str] | None = compile(regex, flags=MULTILINE).search( + full_text + ) + + if match is None: + # Onwards to the next regex! + continue + + start_pos: tuple[int, int] = (1, 0) + simple_token_type: str = get_new_token_type(str(token_type)) + + while match: + span: tuple[int, int] = match.span() + matched_str: str = current_text[span[0] : span[1]] + + # Remove any whitespace previous to the match and update span accordingly + matched_len_initial: int = len(matched_str) + matched_str = matched_str.lstrip() + matched_len_lstripped: int = len(matched_str) + span = ( + (span[0] + matched_len_initial - matched_len_lstripped), + span[1], + ) + + # Other useful variables without relation + newline_count: int = matched_str.count("\n") + previous_text: str = current_text[: span[0]] + + start_line: int = previous_text.count("\n") + start_pos[0] + + # Deal with the easy case first + if not newline_count: + # Prepare token variables + start_col: int = split_text[start_line].find(matched_str) + current_text: str = full_text[span[0] + span[1] - span[0] :] + + # Create and add token + token: Token = ( + (start_line, start_col), + matched_len_lstripped, + simple_token_type, + ) + new_docstring_tokens.append(token) + + start_pos = (start_line, start_col + matched_len_lstripped) + current_text = current_text[: span[1]] + + # Continue onward! 
+ match = compile(regex, flags=MULTILINE).search(current_text) + continue + + # Now for multiple line matches + split_match: list[str] = matched_str.splitlines() + for i in range(newline_count + 1): + match_str: str = split_match[i] + initial_len: int = len(match_str) + start_col: int = initial_len - len(match_str.lstrip()) + + if i == 0: + line: str = split_text[start_line - 1] + + true_len: int = len(line) + lstripped_len: int = len(line.lstrip()) + initial_len = lstripped_len + if lstripped_len != true_len: + # In case the regex doesn't skip whitespace/junk + initial_len = true_len + + start_col = line.find(match_str) + + # Create and add token + token: Token = ( + (start_line + i, start_col), + initial_len - start_col, + simple_token_type, + ) + new_docstring_tokens.append(token) + + start_pos = (start_line + i, start_col + len(match_str)) + + # Continue onward! + current_text = current_text[span[1] :] + match = compile(regex, flags=MULTILINE).search(current_text) + + return new_docstring_tokens diff --git a/salve_ipc/server_functions/highlight/highlight.py b/salve_ipc/server_functions/highlight/highlight.py new file mode 100644 index 0000000..2f4ab64 --- /dev/null +++ b/salve_ipc/server_functions/highlight/highlight.py @@ -0,0 +1,76 @@ +from functools import cache + +from pygments import lex +from pygments.lexer import Lexer, RegexLexer +from pygments.lexers import get_lexer_by_name + +from .docstring_highlight import _LexReturnTokens, proper_docstring_tokens +from .links_and_hidden_chars import find_hidden_chars, get_urls, hidden_chars +from .tokens import ( + Token, + get_new_token_type, + only_tokens_in_text_range, + overwrite_and_merge_tokens, +) + + +@cache +def lexer_by_name_cached(language: str) -> Lexer: + return get_lexer_by_name(language) + + +def get_highlights( + full_text: str, + language: str = "text", + text_range: tuple[int, int] = (1, -1), +) -> list[Token]: + """Gets pygments tokens from text provided in language proved and converts them to 
Token's""" + + # Create some variables used all throughout the function + lexer: Lexer = lexer_by_name_cached(language) + split_text: list[str] = full_text.splitlines() + new_tokens: list[Token] = [] + + if text_range[1] == -1: + # This indicates that the text range should span the length of the entire code + text_range = (text_range[0], len(split_text)) + + start_index: tuple[int, int] = (text_range[0], 0) + # We want only the lines in the text range because this list is iterated + split_text: list[str] = split_text[text_range[0] - 1 : text_range[1]] + + for line in split_text: + og_tokens: _LexReturnTokens = list(lex(line, lexer)) + for token in og_tokens: + new_type: str = get_new_token_type(str(token[0])) + token_str: str = token[1] + token_len: int = len(token_str) + + if token_str == "\n": + # Lexer adds the newline back as its own token + continue + + if not token_str.strip() and new_type == "Text": + # If the token is empty or is plain Text we simply skip it because thats ultimately useless info + start_index = (start_index[0], start_index[1] + token_len) + continue + + # Create and append the Token that will be returned + new_token = (start_index, token_len, new_type) + new_tokens.append(new_token) + + start_index = (start_index[0], start_index[1] + token_len) + start_index = (start_index[0] + 1, 0) + + if isinstance(lexer, RegexLexer): + new_tokens = overwrite_and_merge_tokens( + new_tokens, proper_docstring_tokens(lexer, full_text) + ) + + new_tokens += get_urls(split_text, text_range[0]) + if [char for char in hidden_chars if char in full_text]: + # if there are not hidden chars we don't want to needlessly compute this + new_tokens += find_hidden_chars(split_text, text_range[0]) + + new_tokens = only_tokens_in_text_range(new_tokens, text_range) + return new_tokens diff --git a/salve_ipc/server_functions/highlight/links_and_hidden_chars.py b/salve_ipc/server_functions/highlight/links_and_hidden_chars.py new file mode 100644 index 0000000..3d5e5e4 --- 
/dev/null +++ b/salve_ipc/server_functions/highlight/links_and_hidden_chars.py @@ -0,0 +1,109 @@ +from re import Match, Pattern, compile + +from .tokens import Token + +url_regex: Pattern = compile(r"(ftp|http|https):\/\/[a-zA-Z0-9_-]") + + +def get_urls(lines: list[str], start_line: int = 1) -> list[Token]: + start_pos: tuple[int, int] = (start_line, 0) + url_toks: list[Token] = [] + while True: + if start_pos[0] >= len(lines) + start_line: + break + line: str = lines[start_pos[0] - start_line][start_pos[1] :] + match_start: Match[str] | None = url_regex.search(line) + if match_start is None: + start_pos = (start_pos[0] + 1, 0) + continue + token_start_col = match_start.span()[0] # type: ignore + url: str = line[token_start_col:] + + # Narrow down the url + url = url.strip() + url = url.split()[0] + url = url.split("'")[0] + url = url.split("`")[0] + url = url.split('"')[0] + url = url.rstrip(".,?!") + if "(" not in url: # urls can contain spaces (e.g. wikipedia) + url = url.rstrip(")") + url = url.rstrip(".,?!") + + url_len: int = len(url) + token: Token = ((start_pos[0], token_start_col), url_len, "Link") + url_toks.append(token) + start_pos = (start_pos[0], start_pos[1] + url_len + token_start_col) + + return url_toks + + +hidden_chars: dict[str, str] = { + "\u0009": "CHARACTER TABULATION", + "\u00a0": "NO-BREAK SPACE", + "\u00ad": "SOFT HYPHEN", + "\u034f": "COMBINING GRAPHEME JOINER", + "\u061c": "ARABIC LETTER MARK", + "\u115f": "HANGUL CHOSEONG FILLER", + "\u1160": "HANGUL JUNGSEONG FILLER", + "\u17b4": "KHMER VOWEL INHERENT AQ", + "\u17b5": "KHMER VOWEL INHERENT AA", + "\u180e": "MONGOLIAN VOWEL SEPARATOR", + "\u2000": "EN QUAD", + "\u2001": "EM QUAD", + "\u2002": "EN SPACE", + "\u2003": "EM SPACE", + "\u2004": "THREE-PER-EM SPACE", + "\u2005": "FOUR-PER-EM SPACE", + "\u2006": "SIX-PER-EM SPACE", + "\u2007": "FIGURE SPACE", + "\u2008": "PUNCTUATION SPACE", + "\u2009": "THIN SPACE", + "\u200a": "HAIR SPACE", + "\u200b": "ZERO WIDTH SPACE", + "\u200c": "ZERO 
WIDTH NON-JOINER", + "\u200d": "ZERO WIDTH JOINER", + "\u200e": "LEFT-TO-RIGHT MARK", + "\u200f": "RIGHT-TO-LEFT MARK", + "\u202f": "NARROW NO-BREAK SPACE", + "\u205f": "MEDIUM MATHEMATICAL SPACE", + "\u2060": "WORD JOINER", + "\u2061": "FUNCTION APPLICATION", + "\u2062": "INVISIBLE TIMES", + "\u2063": "INVISIBLE SEPARATOR", + "\u2064": "INVISIBLE PLUS", + "\u206a": "INHIBIT SYMMETRIC SWAPPING", + "\u206b": "ACTIVATE SYMMETRIC SWAPPING", + "\u206c": "INHIBIT ARABIC FORM SHAPING", + "\u206d": "ACTIVATE ARABIC FORM SHAPING", + "\u206e": "NATIONAL DIGIT SHAPES", + "\u206f": "NOMINAL DIGIT SHAPES", + "\u3000": "IDEOGRAPHIC SPACE", + "\u2800": "BRAILLE PATTERN BLANK", + "\u3164": "HANGUL FILLER", + "\ufeff": "ZERO WIDTH NO-BREAK SPACE", + "\uffa0": "HALFWIDTH HANGUL FILLER", + "\u1d159": "MUSICAL SYMBOL NULL NOTEHEAD", + "\u1d173": "MUSICAL SYMBOL BEGIN BEAM", + "\u1d174": "MUSICAL SYMBOL END BEAM", + "\u1d175": "MUSICAL SYMBOL BEGIN TIE", + "\u1d176": "MUSICAL SYMBOL END TIE", + "\u1d177": "MUSICAL SYMBOL BEGIN SLUR", + "\u1d178": "MUSICAL SYMBOL END SLUR", + "\u1d179": "MUSICAL SYMBOL BEGIN PHRASE", + "\u1d17A": "MUSICAL SYMBOL END PHRASE", + "\ue0020": "TAG SPACE", +} + + +def find_hidden_chars(lines: list[str], start_line: int = 1) -> list[Token]: + hidden_char_indexes: list[tuple[tuple[int, int], str]] = [ + ((line_index + start_line, char_index), char) + for line_index, line in enumerate(lines) + for char_index, char in enumerate(line) + if char in list(hidden_chars.keys()) + ] + tok_list: list[Token] = [ + (char[0], len(char[1]), "Hidden_Char") for char in hidden_char_indexes + ] + return tok_list diff --git a/salve_ipc/server_functions/highlight/tokens.py b/salve_ipc/server_functions/highlight/tokens.py new file mode 100644 index 0000000..32c08b0 --- /dev/null +++ b/salve_ipc/server_functions/highlight/tokens.py @@ -0,0 +1,209 @@ +from functools import cache + +Token = tuple[tuple[int, int], int, str] + +generic_tokens: list[str] = [ + "Whitespace", + "Text", + 
"Error", + "Keyword", + "Name", + "String", + "Number", + "Literal", + "Operator", + "Punctuation", + "Comment", + "Generic", + "Link", # Website link (Not given by pygments) + "Hidden_Char", # Hidden chars (no width space kind of stuff) + "Definition", # Definitions +] + +default_tokens: list[str] = [ + "Token.Text.Whitespace", + "Token.Text", + "Token.Error", + "Token.Keyword", + "Token.Name", + "Token.Literal.String", + "Token.Literal.Number", + "Token.Literal", + "Token.Operator", + "Token.Punctuation", + "Token.Comment", + "Token.Generic", +] + + +@cache +def get_new_token_type(old_token: str) -> str: + """Turns pygments token types into a generic predefined Token""" + new_type: str = generic_tokens[0] + for index, token in enumerate(default_tokens): + if old_token.startswith(token): + new_type = generic_tokens[index] + break + return new_type + + +def only_tokens_in_text_range( + tokens: list[Token], text_range: tuple[int, int] +) -> list[Token]: + # We create a new list becase lists are pass by reference + output_tokens: list[Token] = [] + + for token in tokens: + token_lineno: int = token[0][0] + minimum_line: int = text_range[0] + maximum_line: int = text_range[1] + + if token_lineno < minimum_line or token_lineno > maximum_line: + continue + + output_tokens.append(token) + + output_tokens = merge_tokens(output_tokens) + return output_tokens + + +def merge_tokens(tokens: list[Token]) -> list[Token]: + output_tokens: list[Token] = [] + depth: int = 0 + for token in tokens: + # Deal with basic edge case + if depth == 0: + output_tokens.append(token) + depth += 1 + continue + + previous_token = output_tokens[-1] + + # Get our boolean checks + same_token_type: bool = previous_token[2] == token[2] + same_line: bool = previous_token[0][0] == token[0][0] + neighboring_tokens: bool = ( + previous_token[0][1] + previous_token[1] == token[0][1] + ) + + # Determine if tokens should be merged + if not (same_token_type and same_line and neighboring_tokens): + 
output_tokens.append(token) + depth += 1 + continue + + # Replace previous token with new token (we don't increase depth because we are substituting, not adding) + new_token: Token = ( + (token[0][0], previous_token[0][1]), + previous_token[1] + token[1], + token[2], + ) + output_tokens[-1] = new_token + return output_tokens + + +def overwrite_tokens(old_tokens: list[Token], new_tokens: list[Token]): + output_tokens: list[Token] = [] + dont_add_tokens: list[Token] = [] + for new_token in new_tokens: + for old_token in old_tokens: + same_token: bool = old_token == new_token + if same_token: + continue + + same_line: bool = old_token[0][0] == new_token[0][0] + can_add_token: bool = old_token not in dont_add_tokens + if not same_line: + if can_add_token: + output_tokens.append(old_token) + continue + + # Check if the ranges overlap and if so either (remove the old_token and add to don't add list) or, + # if part of the token is out of the new_token_range, remove the part in the new tokens range + + old_token_end: int = old_token[0][1] + old_token[1] + new_token_end: int = new_token[0][1] + new_token[1] + + partial_front_overlap: bool = ( + new_token[0][1] <= old_token_end + and not old_token_end > new_token_end + ) + partial_end_overlap: bool = new_token_end >= old_token[0][1] + fully_contained: bool = ( + old_token_end <= new_token_end + and old_token[0][1] >= new_token[0][1] + ) + + if not ( + partial_front_overlap or partial_end_overlap or fully_contained + ): + continue + + dont_add_tokens.append(old_token) + + while old_token in output_tokens: + output_tokens.remove(old_token) + + if fully_contained: + continue + + # If we are here if means its a partial overlap + if partial_front_overlap: + created_token: Token = ( + (new_token[0][0], old_token[0][1]), + new_token[0][1] - old_token[0][1], + old_token[2], + ) + while created_token in output_tokens: + output_tokens.remove(created_token) + output_tokens.append(created_token) + dont_add_tokens.append(created_token) 
+ continue + + if old_token[0][1] < new_token[0][1]: + created_token_1: Token = ( + (new_token[0][0], old_token[0][1]), + new_token[0][1] - old_token[0][1], + old_token[2], + ) + created_token_2: Token = ( + (new_token[0][0], new_token_end), + old_token_end - new_token_end, + old_token[2], + ) + while created_token_1 in output_tokens: + output_tokens.remove(created_token_1) + output_tokens.append(created_token_1) + while created_token_2 in output_tokens: + output_tokens.remove(created_token_2) + output_tokens.append(created_token_2) + dont_add_tokens.append(created_token_1) + dont_add_tokens.append(created_token_2) + + created_token: Token = ( + (new_token[0][0], new_token_end), + old_token_end - new_token_end, + old_token[2], + ) + while created_token in output_tokens: + output_tokens.remove(created_token) + output_tokens.append(created_token) + dont_add_tokens.append(created_token) + + output_tokens.append(new_token) + + output_tokens = sorted(set(output_tokens)) + return output_tokens + + +def overwrite_and_merge_tokens( + old_tokens: list[Token], new_tokens: list[Token] +) -> list[Token]: + merged_old_tokens: list[Token] = merge_tokens(sorted(set(old_tokens))) + merged_new_tokens: list[Token] = merge_tokens(sorted(set(new_tokens))) + output_tokens: list[Token] = overwrite_tokens( + merged_old_tokens, merged_new_tokens + ) + + output_tokens = sorted(set(merge_tokens(output_tokens))) + return output_tokens diff --git a/salve_ipc/server_functions/misc.py b/salve_ipc/server_functions/misc.py index cde5dd5..3b0bc9f 100644 --- a/salve_ipc/server_functions/misc.py +++ b/salve_ipc/server_functions/misc.py @@ -1,26 +1,8 @@ +from functools import cache from unicodedata import category -Token = tuple[tuple[int, int], int, str] - -generic_tokens: list[str] = [ - "Whitespace", - "Text", - "Error", - "Keyword", - "Name", - "String", - "Number", - "Literal", - "Operator", - "Punctuation", - "Comment", - "Generic", - "Link", # Website link (Not given by pygments) - 
"Hidden_Char", # Hidden chars (no width space kind of stuff) - "Definition", # Definitions -] - +@cache def is_unicode_letter(char: str) -> bool: """Returns a boolean value of whether a given unicode char is a letter or not (includes "_" for code completion reasons)""" return char == "_" or category(char).startswith("L") diff --git a/setup.py b/setup.py index 6a77db1..1fae367 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ setup( name="salve_ipc", - version="0.6.0", + version="0.7.0", description="Salve is an IPC library that can be used by code editors to easily get autocompletions, replacements, editorconfig suggestions, definitions, and syntax highlighting.", author="Moosems", author_email="moosems.j@gmail.com", @@ -15,7 +15,7 @@ long_description=long_description, long_description_content_type="text/markdown", install_requires=["pygments", "pyeditorconfig", "beartype"], - python_requires=">=3.9", + python_requires=">=3.11", license="MIT license", classifiers=[ "Development Status :: 3 - Alpha", diff --git a/tests/test_defintions.py b/tests/test_defintions.py index ad6bd78..432853e 100644 --- a/tests/test_defintions.py +++ b/tests/test_defintions.py @@ -1,3 +1,5 @@ +from pathlib import Path + from salve_ipc.server_functions import get_definition @@ -9,7 +11,7 @@ def test_get_definition(): (r"class ", "after"), (r":?.*=.*", "before"), ] - file = open("tests/testing_file2.py", "r+").read() + file = open(Path("tests/testing_file2.py"), "r+").read() assert get_definition( file, diff --git a/tests/test_ipc.py b/tests/test_ipc.py index 914d51c..b4c0cbc 100644 --- a/tests/test_ipc.py +++ b/tests/test_ipc.py @@ -1,3 +1,5 @@ +from pathlib import Path +from sys import platform from time import sleep from salve_ipc import ( @@ -14,7 +16,9 @@ def test_IPC(): context = IPC() - context.update_file("test", open("tests/testing_file1.py", "r+").read()) + context.update_file( + "test", open(Path("tests/testing_file1.py"), "r+").read() + ) context.request( AUTOCOMPLETE, @@ 
-28,7 +32,9 @@ def test_IPC(): expected_keywords=[], current_word="thid", ) - context.request(HIGHLIGHT, file="test", language="python") + context.request( + HIGHLIGHT, file="test", language="python", text_range=(1, 18) + ) context.request(EDITORCONFIG, file_path=__file__) context.request( DEFINITION, @@ -74,7 +80,8 @@ def test_IPC(): if highlight_output is None: raise AssertionError("Highlight output is None") highlight_output["id"] = 0 - assert highlight_output == { + + expected_output: Response = { "id": 0, "type": "response", "cancelled": False, @@ -91,39 +98,76 @@ def test_IPC(): ((3, 11), 7, "Comment"), ((5, 0), 5, "Name"), ((5, 5), 1, "Punctuation"), - ((5, 6), 1, "String"), - ((5, 7), 1, "String"), - ((5, 8), 1, "String"), + ((5, 6), 3, "String"), ((5, 9), 1, "Punctuation"), ((5, 12), 16, "Comment"), ((8, 0), 5, "Keyword"), ((8, 6), 3, "Name"), ((8, 9), 1, "Punctuation"), ((8, 10), 3, "Name"), - ((8, 13), 1, "Punctuation"), - ((8, 14), 1, "Punctuation"), + ((8, 13), 2, "Punctuation"), ((9, 4), 3, "String"), - ((10, 4), 4, "Name"), + ((10, 4), 4, "String"), ((11, 4), 3, "String"), ((13, 4), 3, "Keyword"), ((13, 8), 8, "Name"), ((13, 16), 1, "Punctuation"), ((13, 17), 4, "Name"), - ((13, 21), 1, "Punctuation"), - ((13, 22), 1, "Punctuation"), + ((13, 21), 2, "Punctuation"), ((14, 8), 4, "Keyword"), ((17, 0), 3, "Name"), - ((17, 3), 1, "Punctuation"), - ((17, 4), 1, "Punctuation"), + ((17, 3), 2, "Punctuation"), ((18, 0), 24, "Comment"), - ((9, 4), 3, "String"), - ((10, 4), 4, "String"), - ((11, 4), 3, "String"), ((18, 2), 22, "Link"), ((5, 7), 1, "Hidden_Char"), ], } + # Deal with Windows weirdness + if platform == "win32": + expected_output = { + "id": 0, + "type": "response", + "cancelled": False, + "command": HIGHLIGHT, + "result": [ + ((1, 0), 4, "Keyword"), + ((1, 5), 4, "Name"), + ((1, 10), 6, "Keyword"), + ((1, 17), 1, "Name"), + ((1, 20), 12, "Comment"), + ((3, 0), 3, "Name"), + ((3, 4), 1, "Operator"), + ((3, 6), 3, "Name"), + ((3, 11), 7, 
"Comment"), + ((5, 0), 5, "Name"), + ((5, 5), 1, "Punctuation"), + ((5, 6), 5, "String"), + ((5, 11), 1, "Punctuation"), + ((5, 14), 16, "Comment"), + ((8, 0), 5, "Keyword"), + ((8, 6), 3, "Name"), + ((8, 9), 1, "Punctuation"), + ((8, 10), 3, "Name"), + ((8, 13), 2, "Punctuation"), + ((9, 4), 3, "String"), + ((10, 4), 4, "String"), + ((11, 4), 3, "String"), + ((13, 4), 3, "Keyword"), + ((13, 8), 8, "Name"), + ((13, 16), 1, "Punctuation"), + ((13, 17), 4, "Name"), + ((13, 21), 2, "Punctuation"), + ((14, 8), 4, "Keyword"), + ((17, 0), 3, "Name"), + ((17, 3), 2, "Punctuation"), + ((18, 0), 24, "Comment"), + ((18, 2), 22, "Link"), + ], + } + + assert highlight_output == expected_output + context.remove_file("test") context.kill_IPC() diff --git a/tests/testing_file1.py b/tests/testing_file1.py index b40a7f8..7da5ef0 100644 --- a/tests/testing_file1.py +++ b/tests/testing_file1.py @@ -16,3 +16,6 @@ def __init__(self): Foo() # https://www.google.com +""" +test +"""