-
Notifications
You must be signed in to change notification settings - Fork 23
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit a5df5fd
Showing
10 changed files
with
744 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
|
||
# Created by https://www.gitignore.io/api/flask,python | ||
# Edit at https://www.gitignore.io/?templates=flask,python | ||
|
||
# | ||
model/ | ||
|
||
### Flask ### | ||
instance/* | ||
!instance/.gitignore | ||
.webassets-cache | ||
|
||
### Flask.Python Stack ### | ||
# Byte-compiled / optimized / DLL files | ||
__pycache__/ | ||
*.py[cod] | ||
*$py.class | ||
|
||
# C extensions | ||
*.so | ||
|
||
# Distribution / packaging | ||
.Python | ||
build/ | ||
develop-eggs/ | ||
dist/ | ||
downloads/ | ||
eggs/ | ||
.eggs/ | ||
lib/ | ||
lib64/ | ||
parts/ | ||
sdist/ | ||
var/ | ||
wheels/ | ||
pip-wheel-metadata/ | ||
share/python-wheels/ | ||
*.egg-info/ | ||
.installed.cfg | ||
*.egg | ||
MANIFEST | ||
|
||
# PyInstaller | ||
# Usually these files are written by a python script from a template | ||
# before PyInstaller builds the exe, so as to inject date/other infos into it. | ||
*.manifest | ||
*.spec | ||
|
||
# Installer logs | ||
pip-log.txt | ||
pip-delete-this-directory.txt | ||
|
||
# Unit test / coverage reports | ||
htmlcov/ | ||
.tox/ | ||
.nox/ | ||
.coverage | ||
.coverage.* | ||
.cache | ||
nosetests.xml | ||
coverage.xml | ||
*.cover | ||
.hypothesis/ | ||
.pytest_cache/ | ||
|
||
# Translations | ||
*.mo | ||
*.pot | ||
|
||
# Django stuff: | ||
*.log | ||
local_settings.py | ||
db.sqlite3 | ||
db.sqlite3-journal | ||
|
||
# Flask stuff: | ||
instance/ | ||
|
||
# Scrapy stuff: | ||
.scrapy | ||
|
||
# Sphinx documentation | ||
docs/_build/ | ||
|
||
# PyBuilder | ||
target/ | ||
|
||
# Jupyter Notebook | ||
.ipynb_checkpoints | ||
|
||
# IPython | ||
profile_default/ | ||
ipython_config.py | ||
|
||
# pyenv | ||
.python-version | ||
|
||
|
||
# celery beat schedule file | ||
celerybeat-schedule | ||
|
||
# SageMath parsed files | ||
*.sage.py | ||
|
||
# Environments | ||
.env | ||
.venv | ||
env/ | ||
venv/ | ||
ENV/ | ||
env.bak/ | ||
venv.bak/ | ||
|
||
# Spyder project settings | ||
.spyderproject | ||
.spyproject | ||
|
||
# Rope project settings | ||
.ropeproject | ||
|
||
# mkdocs documentation | ||
/site | ||
|
||
# mypy | ||
.mypy_cache/ | ||
.dmypy.json | ||
dmypy.json | ||
|
||
# Pyre type checker | ||
.pyre/ | ||
|
||
**/.DS_Store |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# Tag of the tensorflow/tensorflow base image (default: 1.14.0-gpu-py3).
# Override at build time with: --build-arg TENSORFLOW_VERSION=<tag>
ARG TENSORFLOW_VERSION=1.14.0-gpu-py3
FROM tensorflow/tensorflow:${TENSORFLOW_VERSION}

WORKDIR /

# curl is needed at build time to download the model archive below.
# apt-get is used instead of apt: the apt CLI is not stable for scripted use.
RUN apt-get -y update && apt-get -y upgrade && \
    apt-get install -y --no-install-recommends curl

COPY . /galois
WORKDIR /galois

# Download and unpack the latest released model into /galois, install the
# Python dependencies, then remove build-only packages and the apt caches.
# -f makes curl exit non-zero on an HTTP error instead of piping the error
# page into tar.
RUN curl -fSL https://github.com/iedmrc/galois-autocompleter/releases/latest/download/model.tar.xz \
    | tar -xJC . && \
    pip --no-cache-dir install --upgrade pip && \
    pip --no-cache-dir install -r requirements.txt && \
    apt-get purge -y git curl && \
    apt-get autoremove --purge -y && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

CMD [ "python", "main.py" ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
MIT License | ||
|
||
Copyright (c) [2019] [Ibrahim Ethem DEMIRCI] | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
<h1 align="center">Galois Autocompleter</h1> | ||
<p> | ||
<img alt="Version" src="https://img.shields.io/badge/version-0.1.0-blue.svg?cacheSeconds=2592000" /> | ||
<a href="https://twitter.com/iedmrc"> | ||
<img alt="Twitter: iedmrc" src="https://img.shields.io/twitter/follow/iedmrc.svg?style=social" target="_blank" /> | ||
</a> | ||
</p> | ||
|
||
> An autocompleter for code editors based on [OpenAI GPT-2](https://github.com/openai/gpt-2). | ||
### 🏠 [Homepage](https://usegalois.com)
|
||
**Galois** is a code autocompleter for code editors (or any text editor) based on [OpenAI GPT-2](https://github.com/openai/gpt-2). It is trained (finetuned) on a curated list of approximately 45K Python files (~470MB) gathered from GitHub. For now it works properly only on Python, but it is not bad at other languages (thanks to GPT-2's power).
|
||
This repository contains the very first release of the **Galois** project. With this project, I aim to create a **Deep Learning based autocompleter** that anyone can easily run on their own computer. Thus, coding will be easier and more fun!
|
||
![Galois demo GIF](img/python1.gif) | ||
## Installation | ||
|
||
### With Docker | ||
Either clone the repository and build the image from the Dockerfile, or directly run the following command:
|
||
```sh | ||
docker run --rm -dit -p 3030:3030 iedmrc/galois-autocompleter | ||
``` | ||
|
||
### Without Docker | ||
|
||
Clone the repository, download the latest model from the releases tab and uncompress it into the repository directory. Then, run the following commands:
```sh | ||
pip3 install -r requirements.txt | ||
``` | ||
```sh | ||
python3 main.py | ||
``` | ||
|
||
## Usage | ||
Currently, there are no extensions for code editors. You can use it through HTTP. When you run `main.py`, it starts an HTTP (Flask) server. Then you can easily make a POST request to http://localhost:3030/ with a `JSON` body like the following:
|
||
```sh | ||
{"text": "your python code goes here"}
``` | ||
|
||
An example curl command: | ||
|
||
```sh | ||
curl -X POST \ | ||
http://localhost:3030/autocomplete \ | ||
-H 'Content-Type: application/json' \ | ||
-d '{"text":"import os\nimport sys\n# Count lines of codes in the given directory, separated by file extension.\ndef main(directory):\n line_count = {}\n for filename in os.listdir(directory):\n _, ext = os.path.splitext(filename)\n if ext not"}' | ||
``` | ||
|
||
> Check out the [gist here](https://gist.github.com/iedmrc/1e41197a6a2f7a9a654a0df9bd932290) for a `docker-compose` file. | ||
## Finetuning The Model | ||
You can even finetune (re-train) the model on your own code files. Just follow `Max Woolf's` [gpt-2-simple](https://github.com/minimaxir/gpt-2-simple) or `Neil Shepperd's` [gpt-2](https://github.com/nshepperd/gpt-2) repositories with the **`345M`** version. But don't forget to replace the checkpoint (model) with the one in this repository.
|
||
You can train it on Google Colaboratory for free. But if you need a production-grade (i.e. more accurate) one, you may need to train it for a longer time. In my case, it took ~48 hours on a P100 GPU.
|
||
## Planned Works | ||
|
||
- Train the model to predict in most common programming languages. | ||
- Create extensions for most common code editors to use galois as an autocompleter. | ||
- Create a new, more lightweight but powerful model such that anyone can run it on their computer easily.
|
||
## Contribution | ||
Contributions are welcome. Feel free to create an issue or a pull request. | ||
|
||
## Author | ||
|
||
👤 **Ibrahim Ethem DEMIRCI** | ||
|
||
Twitter: [@iedmrc](https://twitter.com/iedmrc) | Github: [@iedmrc](https://github.com/iedmrc) | Patreon: [@iedmrc](https://patreon.com/iedmrc) | ||
|
||
|
||
Ibrahim's open-source projects are supported by his Patreon. If you found this project helpful, any monetary contributions to the Patreon are appreciated and will be put to good creative use. | ||
|
||
## License | ||
It is licensed under MIT License as found in the LICENSE file. | ||
|
||
## Disclaimer | ||
This repo has no affiliation or relationship with OpenAI. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
"""Byte pair encoding utilities""" | ||
|
||
import os | ||
import json | ||
import regex as re | ||
from functools import lru_cache | ||
|
||
@lru_cache()
def bytes_to_unicode():
    """
    Build the lookup table from every utf-8 byte value to a unicode character.

    The reversible bpe codes work on unicode strings, so each of the 256
    possible byte values needs a stand-in character. Without such a table a
    large share of the vocab would have to be spent on raw unicode characters
    to avoid UNKs (around 5K entries for a ~10B token dataset, a significant
    percentage of a normal, say, 32K bpe vocab).

    Bytes in the ranges "!".."~", "¡".."¬" and "®".."ÿ" map to the character
    with the same code point. Every other byte is assigned the next free code
    point starting at 256, which avoids mapping to whitespace/control
    characters the bpe code barfs on.

    Returns a dict keyed by byte value (int) whose values are one-character
    strings. The result is memoized by ``lru_cache``.
    """
    printable = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    already_mapped = set(printable)

    byte_values = list(printable)
    code_points = list(printable)
    shift = 0
    for byte in range(256):
        if byte in already_mapped:
            continue
        # Remaining bytes get consecutive code points above the byte range.
        byte_values.append(byte)
        code_points.append(256 + shift)
        shift += 1

    return {byte: chr(point) for byte, point in zip(byte_values, code_points)}
|
||
def get_pairs(word):
    """Return the set of adjacent symbol pairs in a word.

    Word is represented as a sequence of symbols (symbols being
    variable-length strings), typically a tuple of strings.

    A word with fewer than two symbols has no adjacent pairs, so an empty set
    is returned; this includes the empty word, which previously raised
    IndexError.
    """
    # Pair each symbol with its successor; zip stops at the shorter input,
    # so empty and single-symbol words naturally yield no pairs.
    return set(zip(word, word[1:]))
|
||
class Encoder:
    """Byte-level BPE tokenizer that converts text to token ids and back.

    Attributes:
        encoder: mapping from bpe token string to token id.
        decoder: inverse of ``encoder`` (token id to bpe token string).
        errors: error handling scheme passed to ``bytes.decode`` in ``decode``.
        byte_encoder: table from ``bytes_to_unicode()`` (byte value to character).
        byte_decoder: inverse of ``byte_encoder``.
        bpe_ranks: mapping from merge pair to its position in ``bpe_merges``;
            lower rank merges are applied first.
        cache: memo of ``bpe`` results keyed by the input token.
        pat: compiled pattern used to split text into chunks before bpe.
    """

    def __init__(self, encoder, bpe_merges, errors='replace'):
        """Set up the lookup tables from a vocabulary and an ordered merge list.

        Args:
            encoder: dict mapping bpe token strings to ids.
            bpe_merges: sequence of symbol pairs (tuples), in merge order.
            errors: how to handle errors in decoding (see ``bytes.decode``).
        """
        self.encoder = encoder
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors  # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {}

        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

    def bpe(self, token):
        """Apply the learned merges to a token and return the merged symbols.

        Args:
            token: string of byte-encoded characters (one chunk of text).

        Returns:
            The resulting symbols joined by single spaces. A token that has
            no symbol pairs is returned unchanged. Results of the merge loop
            are stored in ``self.cache``.
        """
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)
        pairs = get_pairs(word)

        if not pairs:
            return token

        while True:
            # Pick the pair with the lowest merge rank; unknown pairs rank last.
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break  # no mergeable pair is left
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    # `first` does not occur again: copy the tail as is.
                    new_word.extend(word[i:])
                    break
                new_word.extend(word[i:j])
                i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            word = tuple(new_word)
            if len(word) == 1:
                break
            pairs = get_pairs(word)
        word = ' '.join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        """Convert text into a list of token ids.

        The text is split with ``self.pat``; each chunk's utf-8 bytes are
        mapped through ``self.byte_encoder``, merged by ``bpe``, and the
        resulting symbols are looked up in ``self.encoder``.

        Raises:
            KeyError: if a resulting bpe symbol is not in ``self.encoder``.
        """
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens

    def decode(self, tokens):
        """Convert an iterable of token ids back into text.

        Ids are mapped to bpe token strings via ``self.decoder``, the
        characters are mapped back to bytes via ``self.byte_decoder``, and the
        bytes are decoded as utf-8 using ``self.errors`` as the error scheme.

        Raises:
            KeyError: if an id is not in ``self.decoder``.
        """
        text = ''.join([self.decoder[token] for token in tokens])
        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
        return text
|
||
def get_encoder(model_name, models_dir):
    """Load the vocabulary and merge list of a model and build an Encoder.

    Reads ``encoder.json`` and ``vocab.bpe`` from
    ``<models_dir>/<model_name>/``; both files are read as utf-8.

    Args:
        model_name: name of the model sub-directory.
        models_dir: directory that contains the model sub-directories.

    Returns:
        An ``Encoder`` built from the loaded vocabulary and merges.

    Raises:
        OSError: if either file cannot be opened.
        json.JSONDecodeError: if ``encoder.json`` is not valid JSON.
    """
    model_dir = os.path.join(models_dir, model_name)
    with open(os.path.join(model_dir, 'encoder.json'), 'r', encoding="utf-8") as f:
        encoder = json.load(f)
    with open(os.path.join(model_dir, 'vocab.bpe'), 'r', encoding="utf-8") as f:
        bpe_data = f.read()
    # The first line is skipped (presumably a version header - confirm against
    # the vocab.bpe file). Blank lines are ignored rather than blindly dropping
    # the final line, so the last merge is kept even when the file does not
    # end with a newline.
    bpe_merges = [
        tuple(merge_str.split())
        for merge_str in bpe_data.split('\n')[1:]
        if merge_str.strip()
    ]
    return Encoder(
        encoder=encoder,
        bpe_merges=bpe_merges,
    )
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Oops, something went wrong.