Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .idea/.gitignore

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1,641 changes: 1,641 additions & 0 deletions SUCESSFUL_LL_APPROACH.ipynb

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
et-xmlfile==1.1.0
numpy @ file:///D:/Trabajo/rest-mex_2022_sentiment_data_training/numpy-1.24.2-cp311-cp311-win_amd64.whl
openpyxl==3.1.2
pandas @ file:///D:/Trabajo/rest-mex_2022_sentiment_data_training/pandas-1.5.3-cp311-cp311-win_amd64.whl
python-dateutil==2.8.2
pytz==2022.7.1
six==1.16.0
107 changes: 107 additions & 0 deletions src/_onlyrequirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# This file may be used to create an environment using:
# $ conda create --name <env> --file <this file>
# platform: linux-64
#_libgcc_mutex=0.1=main
#_openmp_mutex=4.5=1_gnu
blas
boto3
botocore
brotlipy
bzip2
ca-certificates
certifi
cffi
charset-normalizer
click
cryptography
cudatoolkit
ffmpeg
filelock
freetype
giflib
gmp
gnutls
idna
importlib-metadata
intel-openmp
jmespath
joblib
jpeg
lame
lcms2
ld_impl_linux-64
libffi
libgcc-ng
libgfortran-ng
libgfortran4
libgomp
libiconv
libidn2
libpng
libstdcxx-ng
libtasn1
libtiff
libunistring
libuv
libwebp
libwebp-base
lz4-c
mkl
mkl-service
mkl_fft
mkl_random
ncurses
nettle
ninja
numpy
numpy-base
olefile
openh264
openssl
packaging
pillow
pip
pycparser
pyopenssl
pyparsing
pysocks
python==3.7.11
python-dateutil==2.8.2
pytorch==1.7.0
pytorch-mutex==1.0
pyyaml
quadprog
readline
regex
requests
s3transfer
sacremoses
scikit-learn
scipy
setuptools
six
sqlite
threadpoolctl
tk
tokenizers
torchaudio
torchvision
tqdm
transformers
typing_extensions
urllib3
wheel
xz
zipp
zlib
zstd
transformers==4.10.2
zipp==3.6.0
tokenizers==0.10.3
pyparsing==2.4.7
packaging==21.2
quadprog==0.1.10
pyyaml==6.0
filelock==3.3.2
importlib-metadata==4.8.1
huggingface-hub==0.1.1
99 changes: 99 additions & 0 deletions src/_requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# This file may be used to create an environment using:
# $ conda create --name <env> --file <this file>
# platform: linux-64
#_libgcc_mutex=0.1=main
#_openmp_mutex=4.5=1_gnu
blas=1.0=mkl
boto3=1.18.21=pyhd3eb1b0_0

botocore=1.21.41=pyhd3eb1b0_1
brotlipy=0.7.0=py37h27cfd23_1003
bzip2=1.0.8=h7b6447c_0
ca-certificates=2021.10.26=h06a4308_2
certifi=2021.10.8=py37h06a4308_0
cffi=1.14.6=py37h400218f_0
charset-normalizer=2.0.4=pyhd3eb1b0_0
click=8.0.3=pyhd3eb1b0_0
cryptography=35.0.0=py37hd23ed53_0
cudatoolkit=11.0.221=h6bb024c_0
ffmpeg=4.3=hf484d3e_0
filelock=3.3.2=pypi_0
freetype=2.11.0=h70c0345_0
giflib=5.2.1=h7b6447c_0
gmp=6.2.1=h2531618_2
gnutls=3.6.15=he1e5248_0
huggingface-hub=0.1.1=pypi_0
idna=3.2=pyhd3eb1b0_0
importlib-metadata=4.8.1=pypi_0
intel-openmp=2021.4.0=h06a4308_3561
jmespath=0.10.0=pyhd3eb1b0_0
joblib=1.1.0=pyhd3eb1b0_0
jpeg=9d=h7f8727e_0
lame=3.100=h7b6447c_0
lcms2=2.12=h3be6417_0
ld_impl_linux-64=2.35.1=h7274673_9
libffi=3.3=he6710b0_2
libgcc-ng=9.3.0=h5101ec6_17
libgfortran-ng=7.5.0=ha8ba4b0_17
libgfortran4=7.5.0=ha8ba4b0_17
libgomp=9.3.0=h5101ec6_17
libiconv=1.15=h63c8f33_5
libidn2=2.3.2=h7f8727e_0
libpng=1.6.37=hbc83047_0
libstdcxx-ng=9.3.0=hd4cf53a_17
libtasn1=4.16.0=h27cfd23_0
libtiff=4.2.0=h85742a9_0
libunistring=0.9.10=h27cfd23_0
libuv=1.40.0=h7b6447c_0
libwebp=1.2.0=h89dd481_0
libwebp-base=1.2.0=h27cfd23_0
lz4-c=1.9.3=h295c915_1
mkl=2021.4.0=h06a4308_640
mkl-service=2.4.0=py37h7f8727e_0
mkl_fft=1.3.1=py37hd3c417c_0
mkl_random=1.2.2=py37h51133e4_0
ncurses=6.3=heee7806_1
nettle=3.7.3=hbbd107a_1
ninja=1.10.2=hff7bd54_1
numpy=1.21.2=py37h20f2e39_0
numpy-base=1.21.2=py37h79a1101_0
olefile=0.46=py37_0
openh264=2.1.0=hd408876_0
openssl=1.1.1l=h7f8727e_0
packaging=21.2=pypi_0
pillow=8.4.0=py37h5aabda8_0
pip=21.0.1=py37h06a4308_0
pycparser=2.20=py_2
pyopenssl=21.0.0=pyhd3eb1b0_1
pyparsing=2.4.7=pypi_0
pysocks=1.7.1=py37_1
python=3.7.11=h12debd9_0
python-dateutil=2.8.2=pyhd3eb1b0_0
pytorch=1.7.0=py3.7_cuda11.0.221_cudnn8.0.3_0
pytorch-mutex=1.0=cuda
pyyaml=6.0=pypi_0
quadprog=0.1.10=pypi_0
readline=8.1=h27cfd23_0
regex=2021.8.3=py37h7f8727e_0
requests=2.26.0=pyhd3eb1b0_0
s3transfer=0.5.0=pyhd3eb1b0_0
sacremoses=0.0.43=pyhd3eb1b0_0
scikit-learn=1.0.1=py37h51133e4_0
scipy=1.7.1=py37h292c36d_2
setuptools=58.0.4=py37h06a4308_0
six=1.16.0=pyhd3eb1b0_0
sqlite=3.36.0=hc218d9a_0
threadpoolctl=2.2.0=pyh0d69192_0
tk=8.6.11=h1ccaba5_0
tokenizers=0.10.3=pypi_0
torchaudio=0.7.0=py37
torchvision=0.8.1=py37_cu110
tqdm=4.62.3=pyhd3eb1b0_1
transformers=4.10.2=pypi_0
typing_extensions=3.10.0.2=pyh06a4308_0
urllib3=1.26.7=pyhd3eb1b0_0
wheel=0.37.0=pyhd3eb1b0_1
xz=5.2.5=h7b6447c_0
zipp=3.6.0=pypi_0
zlib=1.2.11=h7b6447c_3
zstd=1.4.9=haebb681_0
97 changes: 97 additions & 0 deletions src/_testrequirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# This file may be used to create an environment using:
# $ conda create --name <env> --file <this file>
# platform: linux-64
#_libgcc_mutex==0.1
#_openmp_mutex==4.5
blas
boto3
botocore
brotlipy
bzip2
ca-certificates
certifi
cffi
charset-normalizer
click
cryptography
cudatoolkit
ffmpeg
filelock
freetype
giflib
gmp
gnutls
idna
importlib-metadata
intel-openmp
jmespath
joblib
jpeg
lame
lcms2
ld_impl_linux-64
libffi
libgcc-ng
libgfortran-ng
libgfortran4
libgomp
libiconv
libidn2
libpng
libstdcxx-ng
libtasn1
libtiff
libunistring
libuv
libwebp
libwebp-base
lz4-c
mkl
mkl-service
mkl_fft
mkl_random
ncurses
nettle
ninja
numpy
numpy-base
olefile
openh264
openssl
packaging
pillow
pip
pycparser
pyopenssl
pyparsing
pysocks
python==3.7.11
python-dateutil==2.8.2
pytorch=1.7.0=py3.7_cuda11.0.221_cudnn8.0.3_0
pytorch-mutex=1.0=cuda
pyyaml
quadprog
readline
regex
requests
s3transfer
sacremoses
scikit-learn
scipy
setuptools
six
sqlite
threadpoolctl
tk
tokenizers
torchaudio
torchvision
tqdm
transformers=4.10.2
typing_extensions
urllib3
wheel
xz
zipp
zlib
zstd
13 changes: 13 additions & 0 deletions src/piprequirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# This file lists the pip-only packages of the environment; install them with:
# $ pip install -r <this file>
# platform: linux-64
transformers==4.10.2
zipp==3.6.0
tokenizers==0.10.3
pyparsing==2.4.7
packaging==21.2
quadprog==0.1.10
pyyaml==6.0
filelock==3.3.2
importlib-metadata==4.8.1
huggingface-hub==0.1.1
5 changes: 5 additions & 0 deletions src/serialization/README.MD
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# INSTRUCCIONES
1. Copiar los 2 archivos de Excel (`Track_Train.xlsx` y `Rest_Mex_Sentiment_Analysis_2023_Train.xlsx`) en esta carpeta
2. Tener instalado Python 3.11
3. Instalar las dependencias necesarias, que están listadas en el archivo requirements.txt en la raíz del repositorio
4. Ejecutar los scripts
15 changes: 15 additions & 0 deletions src/serialization/classify_to_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from pathlib import Path

import pandas as pd

# Split the 2023 training spreadsheet into one JSON file per value of the
# "Type" column (Hotel, Restaurant, Attractive).
#
# NOTE: the original author developed and ran this script with Python 3.11,
# chosen for its speed improvements over earlier versions.

# Load the dataset; the path is relative to the current working directory.
df = pd.read_excel("Rest_Mex_Sentiment_Analysis_2023_Train.xlsx")

# Build output paths with pathlib instead of a hard-coded "\\" separator, so
# the files land inside the "classified" directory on every platform, and
# create that directory up front so the writes cannot fail because it is
# missing.
output_dir = Path("classified")
output_dir.mkdir(parents=True, exist_ok=True)

# (value of the "Type" column, name of the JSON file that receives its rows)
for type_label, file_name in (
    ("Hotel", "hotel.json"),
    ("Restaurant", "restaurant.json"),
    ("Attractive", "attractive.json"),
):
    subset = df[df["Type"] == type_label]
    # force_ascii=False keeps accented characters readable in the output;
    # the file is therefore written explicitly as UTF-8.
    (output_dir / file_name).write_text(
        subset.to_json(force_ascii=False, orient="index"),
        encoding="utf-8",
    )
10 changes: 10 additions & 0 deletions src/serialization/find_diferents.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import pandas as pd

# Load both training spreadsheets from the current working directory.
df2022 = pd.read_excel('Track_Train.xlsx')
df2023 = pd.read_excel('Rest_Mex_Sentiment_Analysis_2023_Train.xlsx')

# Stack the two datasets and discard every row that occurs more than once
# (keep=False removes all copies of a duplicated row, not just the extras).
stacked = pd.concat([df2023, df2022])
df = stacked.drop_duplicates(keep=False)

# Flag rows in which any cell, rendered as text, contains 'Attractive',
# then keep only the rows that were not flagged.
as_text = df.astype(str)
cell_matches = as_text.apply(lambda column: column.str.contains('Attractive'))
row_has_attractive = cell_matches.any(axis=1)
df = df[~row_has_attractive]

# Save the filtered result without the index column.
df.to_excel('dataset_filtrado.xlsx', index=False)
18 changes: 18 additions & 0 deletions src/serialization/find_equals.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import pandas as pd

# Load both training datasets from the current working directory.
df2022 = pd.read_excel('Track_Train.xlsx')
df2023 = pd.read_excel('Rest_Mex_Sentiment_Analysis_2023_Train.xlsx')

# Distinct review texts present in each dataset.
reviews_ds22 = df2022['Review'].unique().tolist()
reviews_ds23 = df2023['Review'].unique().tolist()

# Keep the rows of each dataset whose review text does not appear in the
# other one. Each result is taken from its own year's frame so that the
# variable names and output files match their contents (the two filters
# were previously swapped, writing 2023-only rows to only_2022.xlsx and
# 2022-only rows to only_2023.xlsx).
rows_only_in_ds22_df = df2022[~df2022['Review'].isin(reviews_ds23)]
rows_only_in_ds23_df = df2023[~df2023['Review'].isin(reviews_ds22)]

# Export the resulting datasets without the index column.
rows_only_in_ds22_df.to_excel('only_2022.xlsx', index=False)
rows_only_in_ds23_df.to_excel('only_2023.xlsx', index=False)
Loading