Skip to content

Commit

Permalink
Merge branch 'main' into dependabot/pip/pydantic-lt-2.11.0
Browse files Browse the repository at this point in the history
  • Loading branch information
vkehfdl1 authored Dec 12, 2024
2 parents c2a6a4f + aa0bfbf commit 7a97d66
Show file tree
Hide file tree
Showing 49 changed files with 1,084 additions and 96 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -162,3 +162,6 @@ pytest.ini
.DS_Store
projects/tutorial_1
!projects/tutorial_1/config.yaml

# Visual Studio Code
.vscode/
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -441,7 +441,7 @@ docker run --rm -it \
-v ~/.cache/huggingface:/root/.cache/huggingface \
-v $(pwd)/projects:/usr/src/app/projects \
-e OPENAI_API_KEY=${OPENAI_API_KEY} \
autoraghq/autorag:api evaluate \
autoraghq/autorag:api-latest evaluate \
--config /usr/src/app/projects/tutorial_1/config.yaml \
--qa_data_path /usr/src/app/projects/tutorial_1/qa_test.parquet \
--corpus_data_path /usr/src/app/projects/tutorial_1/corpus.parquet \
Expand All @@ -455,7 +455,7 @@ docker run --rm -it \
-v ~/.cache/huggingface:/root/.cache/huggingface \
-v $(pwd)/projects:/usr/src/app/projects \
-e OPENAI_API_KEY=${OPENAI_API_KEY} \
autoraghq/autorag:api validate \
autoraghq/autorag:api-latest validate \
--config /usr/src/app/projects/tutorial_1/config.yaml \
--qa_data_path /usr/src/app/projects/tutorial_1/qa_test.parquet \
--corpus_data_path /usr/src/app/projects/tutorial_1/corpus.parquet
Expand All @@ -469,7 +469,7 @@ docker run --rm -it \
-v $(pwd)/projects:/usr/src/app/projects \
-e OPENAI_API_KEY=${OPENAI_API_KEY} \
-p 8502:8502 \
autoraghq/autorag:api dashboard \
autoraghq/autorag:api-latest dashboard \
--trial_dir /usr/src/app/projects/tutorial_1/0
```

Expand All @@ -481,7 +481,7 @@ docker run --rm -it \
-v $(pwd)/projects:/usr/src/app/projects \
-e OPENAI_API_KEY=${OPENAI_API_KEY} \
-p 8501:8501 \
autoraghq/autorag:api run_web --trial_path ./projects/tutorial_1/0
autoraghq/autorag:api-latest run_web --trial_path ./projects/tutorial_1/0
```

#### Key Points :
Expand Down
2 changes: 1 addition & 1 deletion autorag/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.3.10rc1
0.3.12
7 changes: 5 additions & 2 deletions autorag/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,14 +75,17 @@ def evaluate(config, qa_data_path, corpus_data_path, project_dir, skip_validatio
@click.option(
"--project_dir", help="Path to project directory.", type=str, default=None
)
def run_api(config_path, host, port, trial_dir, project_dir):
@click.option(
"--remote", help="Run the API server in remote mode.", type=bool, default=False
)
def run_api(config_path, host, port, trial_dir, project_dir, remote: bool):
if trial_dir is None:
runner = ApiRunner.from_yaml(config_path, project_dir=project_dir)
else:
runner = ApiRunner.from_trial_folder(trial_dir)
logger.info(f"Running API server at {host}:{port}...")
nest_asyncio.apply()
runner.run_api_server(host, port)
runner.run_api_server(host, port, remote=remote)


@click.command()
Expand Down
33 changes: 29 additions & 4 deletions autorag/data/parse/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from datetime import datetime
from glob import glob
from typing import Tuple, List, Optional
import os

from autorag.utils import result_to_dataframe
from autorag.data.utils.util import get_file_metadata
Expand All @@ -14,14 +15,38 @@ def parser_node(func):
@functools.wraps(func)
@result_to_dataframe(["texts", "path", "page", "last_modified_datetime"])
def wrapper(
data_path_glob: str, parse_method: Optional[str] = None, **kwargs
data_path_glob: str,
file_type: str,
parse_method: Optional[str] = None,
**kwargs,
) -> Tuple[List[str], List[str], List[int], List[datetime]]:
logger.info(f"Running parser - {func.__name__} module...")

data_path_list = glob(data_path_glob)
if not data_path_list:
raise FileNotFoundError(f"data does not exits in {data_path_glob}")

assert file_type in [
"pdf",
"csv",
"json",
"md",
"html",
"xml",
"all_files",
], f"search type {file_type} is not supported"

# extract only files from data_path_list based on the file_type set in the YAML file
data_paths = (
[
data_path
for data_path in data_path_list
if os.path.basename(data_path).split(".")[-1] == file_type
]
if file_type != "all_files"
else data_path_list
)

if func.__name__ == "langchain_parse":
parse_method = parse_method.lower()
if parse_method == "directory":
Expand All @@ -30,14 +55,14 @@ def wrapper(
folder_path = "/".join(path_split_list)
kwargs.update({"glob": glob_path, "path": folder_path})
result = func(
data_path_list=data_path_list, parse_method=parse_method, **kwargs
data_path_list=data_paths, parse_method=parse_method, **kwargs
)
else:
result = func(
data_path_list=data_path_list, parse_method=parse_method, **kwargs
data_path_list=data_paths, parse_method=parse_method, **kwargs
)
elif func.__name__ in ["clova_ocr", "llama_parse", "table_hybrid_parse"]:
result = func(data_path_list=data_path_list, **kwargs)
result = func(data_path_list=data_paths, **kwargs)
else:
raise ValueError(f"Unsupported module_type: {func.__name__}")
result = _add_last_modified_datetime(result)
Expand Down
99 changes: 96 additions & 3 deletions autorag/data/parse/run.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,90 @@
import os
from typing import List, Callable, Dict
import pandas as pd
from glob import glob

from autorag.strategy import measure_speed
from autorag.data.utils.util import get_param_combinations

# Default parser settings keyed by file extension. run_parser looks a file
# type up here when it appears in the data paths but has no module configured.
# Every default uses the langchain_parse module; only the parse method varies.
default_map = {
    extension: {
        "file_type": extension,
        "module_type": "langchain_parse",
        "parse_method": method,
    }
    for extension, method in (
        ("pdf", "pdfminer"),
        ("csv", "csv"),
        ("md", "unstructuredmarkdown"),
        ("html", "bshtml"),
        ("xml", "unstructuredxml"),
    )
}


def run_parser(
modules: List[Callable],
module_params: List[Dict],
data_path_glob: str,
project_dir: str,
all_files: bool,
):
if not all_files:
# Set the parsing module to default if it is a file type in paths but not set in YAML.
data_path_list = glob(data_path_glob)
if not data_path_list:
raise FileNotFoundError(f"data does not exits in {data_path_glob}")

file_types = set(
[os.path.basename(data_path).split(".")[-1] for data_path in data_path_list]
)
set_file_types = set([module["file_type"] for module in module_params])

# Calculate the set difference once
file_types_to_remove = set_file_types - file_types

# Use list comprehension to filter out unwanted elements
module_params = [
param
for param in module_params
if param["file_type"] not in file_types_to_remove
]
modules = [
module
for module, param in zip(modules, module_params)
if param["file_type"] not in file_types_to_remove
]

# create a list of only those file_types that are in file_types but not in set_file_types
missing_file_types = list(file_types - set_file_types)

if missing_file_types:
add_modules_list = []
for missing_file_type in missing_file_types:
if missing_file_type == "json":
raise ValueError(
"JSON file type must have a jq_schema so you must set it in the YAML file."
)

add_modules_list.append(default_map[missing_file_type])

add_modules, add_params = get_param_combinations(add_modules_list)
modules.extend(add_modules)
module_params.extend(add_params)

results, execution_times = zip(
*map(
lambda x: measure_speed(x[0], data_path_glob=data_path_glob, **x[1]),
Expand All @@ -20,9 +94,19 @@ def run_parser(
average_times = list(map(lambda x: x / len(results[0]), execution_times))

# save results to parquet files
filepaths = list(
map(lambda x: os.path.join(project_dir, f"{x}.parquet"), range(len(modules)))
)
if all_files:
if len(module_params) > 1:
raise ValueError(
"All files is set to True, You can only use one parsing module."
)
filepaths = [os.path.join(project_dir, "parsed_result.parquet")]
else:
filepaths = list(
map(
lambda x: os.path.join(project_dir, f"{x['file_type']}.parquet"),
module_params,
)
)
list(map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths)))
filenames = list(map(lambda x: os.path.basename(x), filepaths))

Expand All @@ -35,4 +119,13 @@ def run_parser(
}
)
summary_df.to_csv(os.path.join(project_dir, "summary.csv"), index=False)

# concat all parquet files here if not all_files.
if not all_files:
dataframes = [pd.read_parquet(file) for file in filepaths]
combined_df = pd.concat(dataframes, ignore_index=True)
combined_df.to_parquet(
os.path.join(project_dir, "parsed_result.parquet"), index=False
)

return summary_df
14 changes: 13 additions & 1 deletion autorag/data/qa/query/llama_gen_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from llama_index.core.base.llms.base import BaseLLM
from llama_index.core.base.llms.types import ChatResponse, ChatMessage, MessageRole

from autorag.data.qa.query.prompt import QUERY_GEN_PROMPT
from autorag.data.qa.query.prompt import QUERY_GEN_PROMPT, QUERY_GEN_PROMPT_EXTRA


async def llama_index_generate_base(
Expand Down Expand Up @@ -68,3 +68,15 @@ async def custom_query_gen(
messages: List[ChatMessage],
) -> Dict:
return await llama_index_generate_base(row, llm, messages)


# Experimental feature: can only use factoid_single_hop
async def multiple_queries_gen(
    row: Dict,
    llm: BaseLLM,
    lang: str = "en",
    n: int = 3,
) -> Dict:
    """Generate several factoid single-hop queries for one row in a single call.

    Takes the "factoid_single_hop" prompt for ``lang``, appends the
    "multiple_queries" extra instruction (formatted with ``n``) to the content
    of its first message, and delegates to ``llama_index_generate_base``.

    :param row: The row passed through to ``llama_index_generate_base``.
    :param llm: The LLM passed through to ``llama_index_generate_base``.
    :param lang: Language key used to select the prompt and the extra instruction.
    :param n: Number of questions requested in the extra instruction.
    :return: Whatever ``llama_index_generate_base`` returns for this row.
    :raises KeyError: If ``lang`` has no entry in the prompt tables.
    """
    import copy  # local import so this fix does not depend on the file header

    # The prompt table holds module-level message objects shared by every
    # caller. Mutating them in place would append the extra instruction again
    # on each call, so work on a private deep copy instead.
    _messages = copy.deepcopy(QUERY_GEN_PROMPT["factoid_single_hop"][lang])
    _messages[0].content += QUERY_GEN_PROMPT_EXTRA["multiple_queries"][lang].format(n=n)
    return await llama_index_generate_base(row, llm, _messages)
33 changes: 27 additions & 6 deletions autorag/data/qa/query/prompt.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@
2. Questions should be as detailed as possible from Text
3. Create questions that ask about factual information from the Text
4. Do not mention any of these in the questions: "in the given text", "in the provided information", etc.
Users do not know the passage source of the question, so it should not be mentioned in the question.""",
Users do not know the passage source of the question, so it should not be mentioned in the question.
5. Do not ask about the file name or the file title. Ask about the content of the file.
For example, avoid to write questions like `What is the file name of the document?`""",
)
],
"ko": [
Expand All @@ -41,7 +43,9 @@
3. Text에서 사실적 정보를 요구하는 질문을 만들어야 합니다. 즉, Text를 기반으로 사실 질문을 만드세요.
4. 질문에 “주어진 Text에서” 또는 “제공된 단락에서”와 같은 표현을 포함해서는 안 됩니다.
사용자는 질문의 출처가 Text라는 것을 모르기 때문에 반드시 그 출처를 언급해서는 안 됩니다.
5. 질문을 한국어로 작성하세요.""",
5. 파일 이름이나 파일 제목에 대한 질문을 하지 마세요. 파일의 내용에 대해 물어보세요.
예를 들어, '문서의 파일 이름은 무엇입니까?'와 같은 질문을 작성하지 마세요.
6. 질문을 한국어로 작성하세요.""",
)
],
"ja": [
Expand All @@ -63,7 +67,9 @@
3. Textで事実的情報を要求する質問を作らなければなりません。 つまり、Textに基づいて質問を作成します。
4. 質問に「与えられたTextで」または「提供された段落で」のような表現を含めてはいけません。
ユーザーは質問の出所がTextだということを知らないので、必ずしもその出所を言及してはいけません。
5. 質問を日本語で作成しなさい。""",
5. ファイル名やファイルタイトルを訊かないでください。ファイルの内容について聞いてください。
例えば、「このドキュメントのファイル名は何ですか?
6. 質問を日本語で作成しなさい。""",
)
],
},
Expand All @@ -81,7 +87,9 @@
3. Create questions that ask about information from the Text
4. MUST include specific keywords from the Text.
5. Do not mention any of these in the questions: "in the given text", "in the provided information", etc.
Users do not know the passage source of the question, so it should not be mentioned in the question.""",
Users do not know the passage source of the question, so it should not be mentioned in the question.
6. Do not ask about the file name or the file title. Ask about the content of the file.
For example, avoid to write questions like `What is the file name of the document?""",
)
],
"ko": [
Expand All @@ -98,7 +106,9 @@
4. Text의 특정 키워드를 반드시 질문에 포함하세요.
5. 질문에 “주어진 Text에서” 또는 “제공된 단락에서”와 같은 표현을 포함해서는 안 됩니다.
사용자는 질문의 출처가 Text라는 것을 모르기 때문에 반드시 그 출처를 언급해서는 안 됩니다.
6. 질문을 한국어로 작성하세요.""",
6. 파일 이름이나 파일 제목에 대한 질문을 하지 마세요. 파일의 내용에 대해 물어보세요.
예를 들어, '문서의 파일 이름은 무엇입니까?'와 같은 질문을 작성하지 마세요.
7. 질문을 한국어로 작성하세요.""",
)
],
"ja": [
Expand All @@ -115,7 +125,9 @@
4. Textの特定のキーワードを必ず質問に含みます。
5. 質問に「与えられたTextで」または「提供された段落で」のような表現を含めてはいけません。
ユーザーは質問の出所がTextだということを知らないので、必ずしもその出所を言及してはいけません。
6. 質問を日本語で書きましょう。""",
6. ファイル名やファイルタイトルを訊かないでください。ファイルの内容について聞いてください。
例えば、「このドキュメントのファイル名は何ですか?
7. 質問を日本語で書きましょう。""",
)
],
},
Expand Down Expand Up @@ -179,3 +191,12 @@
],
},
}

# Experimental feature
# Extra instruction snippets, grouped by feature name and then by language.
# Each snippet carries an {n} placeholder that the caller fills via str.format.
QUERY_GEN_PROMPT_EXTRA = dict(
    multiple_queries=dict(
        en="\nAdditional instructions:\n - Please make {n} questions.",
        ko="\n추가 지침:\n - 질문은 {n}개를 만드세요.",
        ja="\n追加指示:\n - 質問を{n}個作成してください。",
    ),
)
Loading

0 comments on commit 7a97d66

Please sign in to comment.