Merge branch 'main' into dependabot/pip/pydantic-lt-2.11.0

Marker-Inc-Korea · Dec 12, 2024 · 7a97d66 · 7a97d66
2 parents c2a6a4f + aa0bfbf
commit 7a97d66
Show file tree

Hide file tree

Showing 49 changed files with 1,084 additions and 96 deletions.
diff --git a/.gitignore b/.gitignore
@@ -162,3 +162,6 @@ pytest.ini
 .DS_Store
 projects/tutorial_1
 !projects/tutorial_1/config.yaml
+
+# Visual Studio Code
+.vscode/
diff --git a/README.md b/README.md
@@ -441,7 +441,7 @@ docker run --rm -it \
   -v ~/.cache/huggingface:/root/.cache/huggingface \
   -v $(pwd)/projects:/usr/src/app/projects \
   -e OPENAI_API_KEY=${OPENAI_API_KEY} \
-  autoraghq/autorag:api evaluate \
+  autoraghq/autorag:api-latest evaluate \
   --config /usr/src/app/projects/tutorial_1/config.yaml \
   --qa_data_path /usr/src/app/projects/tutorial_1/qa_test.parquet \
   --corpus_data_path /usr/src/app/projects/tutorial_1/corpus.parquet \
@@ -455,7 +455,7 @@ docker run --rm -it \
   -v ~/.cache/huggingface:/root/.cache/huggingface \
   -v $(pwd)/projects:/usr/src/app/projects \
   -e OPENAI_API_KEY=${OPENAI_API_KEY} \
-  autoraghq/autorag:api validate \
+  autoraghq/autorag:api-latest validate \
   --config /usr/src/app/projects/tutorial_1/config.yaml \
   --qa_data_path /usr/src/app/projects/tutorial_1/qa_test.parquet \
   --corpus_data_path /usr/src/app/projects/tutorial_1/corpus.parquet
@@ -469,7 +469,7 @@ docker run --rm -it \
   -v $(pwd)/projects:/usr/src/app/projects \
   -e OPENAI_API_KEY=${OPENAI_API_KEY} \
   -p 8502:8502 \
-  autoraghq/autorag:api dashboard \
+  autoraghq/autorag:api-latest dashboard \
     --trial_dir /usr/src/app/projects/tutorial_1/0
 ```
 
@@ -481,7 +481,7 @@ docker run --rm -it \
   -v $(pwd)/projects:/usr/src/app/projects \
   -e OPENAI_API_KEY=${OPENAI_API_KEY} \
   -p 8501:8501 \
-  autoraghq/autorag:api run_web --trial_path ./projects/tutorial_1/0
+  autoraghq/autorag:api-latest run_web --trial_path ./projects/tutorial_1/0
 ```
 
 #### Key Points :

diff --git a/autorag/VERSION b/autorag/VERSION
@@ -1 +1 @@
-0.3.10rc1
+0.3.12
diff --git a/autorag/cli.py b/autorag/cli.py
@@ -75,14 +75,17 @@ def evaluate(config, qa_data_path, corpus_data_path, project_dir, skip_validatio
 @click.option(
 	"--project_dir", help="Path to project directory.", type=str, default=None
 )
-def run_api(config_path, host, port, trial_dir, project_dir):
+@click.option(
+	"--remote", help="Run the API server in remote mode.", type=bool, default=False
+)
+def run_api(config_path, host, port, trial_dir, project_dir, remote: bool):
 	if trial_dir is None:
 		runner = ApiRunner.from_yaml(config_path, project_dir=project_dir)
 	else:
 		runner = ApiRunner.from_trial_folder(trial_dir)
 	logger.info(f"Running API server at {host}:{port}...")
 	nest_asyncio.apply()
-	runner.run_api_server(host, port)
+	runner.run_api_server(host, port, remote=remote)
 
 
 @click.command()

diff --git a/autorag/data/parse/base.py b/autorag/data/parse/base.py
@@ -3,6 +3,7 @@
 from datetime import datetime
 from glob import glob
 from typing import Tuple, List, Optional
+import os
 
 from autorag.utils import result_to_dataframe
 from autorag.data.utils.util import get_file_metadata
@@ -14,14 +15,38 @@ def parser_node(func):
 	@functools.wraps(func)
 	@result_to_dataframe(["texts", "path", "page", "last_modified_datetime"])
 	def wrapper(
-		data_path_glob: str, parse_method: Optional[str] = None, **kwargs
+		data_path_glob: str,
+		file_type: str,
+		parse_method: Optional[str] = None,
+		**kwargs,
 	) -> Tuple[List[str], List[str], List[int], List[datetime]]:
 		logger.info(f"Running parser - {func.__name__} module...")
 
 		data_path_list = glob(data_path_glob)
 		if not data_path_list:
 			raise FileNotFoundError(f"data does not exits in {data_path_glob}")
 
+		assert file_type in [
+			"pdf",
+			"csv",
+			"json",
+			"md",
+			"html",
+			"xml",
+			"all_files",
+		], f"search type {file_type} is not supported"
+
+		# extract only files from data_path_list based on the file_type set in the YAML file
+		data_paths = (
+			[
+				data_path
+				for data_path in data_path_list
+				if os.path.basename(data_path).split(".")[-1] == file_type
+			]
+			if file_type != "all_files"
+			else data_path_list
+		)
+
 		if func.__name__ == "langchain_parse":
 			parse_method = parse_method.lower()
 			if parse_method == "directory":
@@ -30,14 +55,14 @@ def wrapper(
 				folder_path = "/".join(path_split_list)
 				kwargs.update({"glob": glob_path, "path": folder_path})
 				result = func(
-					data_path_list=data_path_list, parse_method=parse_method, **kwargs
+					data_path_list=data_paths, parse_method=parse_method, **kwargs
 				)
 			else:
 				result = func(
-					data_path_list=data_path_list, parse_method=parse_method, **kwargs
+					data_path_list=data_paths, parse_method=parse_method, **kwargs
 				)
 		elif func.__name__ in ["clova_ocr", "llama_parse", "table_hybrid_parse"]:
-			result = func(data_path_list=data_path_list, **kwargs)
+			result = func(data_path_list=data_paths, **kwargs)
 		else:
 			raise ValueError(f"Unsupported module_type: {func.__name__}")
 		result = _add_last_modified_datetime(result)

diff --git a/autorag/data/parse/run.py b/autorag/data/parse/run.py
@@ -1,16 +1,90 @@
 import os
 from typing import List, Callable, Dict
 import pandas as pd
+from glob import glob
 
 from autorag.strategy import measure_speed
+from autorag.data.utils.util import get_param_combinations
+
+# Fallback parser settings keyed by file extension. run_parser looks an
+# extension up here when files of that type exist under the data glob but the
+# YAML config declares no module for it. "json" is intentionally absent:
+# run_parser raises for it because a jq_schema must be supplied by the user.
+default_map = {
+	"pdf": {
+		"file_type": "pdf",
+		"module_type": "langchain_parse",
+		"parse_method": "pdfminer",
+	},
+	"csv": {
+		"file_type": "csv",
+		"module_type": "langchain_parse",
+		"parse_method": "csv",
+	},
+	"md": {
+		"file_type": "md",
+		"module_type": "langchain_parse",
+		"parse_method": "unstructuredmarkdown",
+	},
+	"html": {
+		"file_type": "html",
+		"module_type": "langchain_parse",
+		"parse_method": "bshtml",
+	},
+	"xml": {
+		"file_type": "xml",
+		"module_type": "langchain_parse",
+		"parse_method": "unstructuredxml",
+	},
+}
 
 
 def run_parser(
 	modules: List[Callable],
 	module_params: List[Dict],
 	data_path_glob: str,
 	project_dir: str,
+	all_files: bool,
 ):
+	if not all_files:
+		# Set the parsing module to default if it is a file type in paths but not set in YAML.
+		data_path_list = glob(data_path_glob)
+		if not data_path_list:
+			raise FileNotFoundError(f"data does not exits in {data_path_glob}")
+
+		# file_types: extensions actually present under the glob.
+		# set_file_types: file types that the YAML configured a module for.
+		file_types = set(
+			[os.path.basename(data_path).split(".")[-1] for data_path in data_path_list]
+		)
+		set_file_types = set([module["file_type"] for module in module_params])
+
+		# Calculate the set difference once
+		file_types_to_remove = set_file_types - file_types
+
+		# Drop configured modules whose file type has no matching file on disk.
+		# modules and module_params are parallel lists, so they must be filtered
+		# together as pairs: filtering module_params first and then zipping
+		# modules against the already-shortened list would misalign the pairs
+		# and keep the wrong callables.
+		kept_pairs = [
+			(module, param)
+			for module, param in zip(modules, module_params)
+			if param["file_type"] not in file_types_to_remove
+		]
+		modules = [module for module, _ in kept_pairs]
+		module_params = [param for _, param in kept_pairs]
+
+		# create a list of only those file_types that are in file_types but not in set_file_types
+		missing_file_types = list(file_types - set_file_types)
+
+		if missing_file_types:
+			add_modules_list = []
+			for missing_file_type in missing_file_types:
+				# JSON has no usable default because it needs a user-provided jq_schema.
+				if missing_file_type == "json":
+					raise ValueError(
+						"JSON file type must have a jq_schema so you must set it in the YAML file."
+					)
+
+				add_modules_list.append(default_map[missing_file_type])
+
+			add_modules, add_params = get_param_combinations(add_modules_list)
+			modules.extend(add_modules)
+			module_params.extend(add_params)
+
 	results, execution_times = zip(
 		*map(
 			lambda x: measure_speed(x[0], data_path_glob=data_path_glob, **x[1]),
@@ -20,9 +94,19 @@ def run_parser(
 	average_times = list(map(lambda x: x / len(results[0]), execution_times))
 
 	# save results to parquet files
-	filepaths = list(
-		map(lambda x: os.path.join(project_dir, f"{x}.parquet"), range(len(modules)))
-	)
+	if all_files:
+		if len(module_params) > 1:
+			raise ValueError(
+				"All files is set to True, You can only use one parsing module."
+			)
+		filepaths = [os.path.join(project_dir, "parsed_result.parquet")]
+	else:
+		filepaths = list(
+			map(
+				lambda x: os.path.join(project_dir, f"{x['file_type']}.parquet"),
+				module_params,
+			)
+		)
 	list(map(lambda x: x[0].to_parquet(x[1], index=False), zip(results, filepaths)))
 	filenames = list(map(lambda x: os.path.basename(x), filepaths))
 
@@ -35,4 +119,13 @@ def run_parser(
 		}
 	)
 	summary_df.to_csv(os.path.join(project_dir, "summary.csv"), index=False)
+
+	# concat all parquet files here if not all_files.
+	if not all_files:
+		dataframes = [pd.read_parquet(file) for file in filepaths]
+		combined_df = pd.concat(dataframes, ignore_index=True)
+		combined_df.to_parquet(
+			os.path.join(project_dir, "parsed_result.parquet"), index=False
+		)
+
 	return summary_df
diff --git a/autorag/data/qa/query/llama_gen_query.py b/autorag/data/qa/query/llama_gen_query.py
@@ -4,7 +4,7 @@
 from llama_index.core.base.llms.base import BaseLLM
 from llama_index.core.base.llms.types import ChatResponse, ChatMessage, MessageRole
 
-from autorag.data.qa.query.prompt import QUERY_GEN_PROMPT
+from autorag.data.qa.query.prompt import QUERY_GEN_PROMPT, QUERY_GEN_PROMPT_EXTRA
 
 
 async def llama_index_generate_base(
@@ -68,3 +68,15 @@ async def custom_query_gen(
 	messages: List[ChatMessage],
 ) -> Dict:
 	return await llama_index_generate_base(row, llm, messages)
+
+
+# Experimental feature: can only use factoid_single_hop
+async def multiple_queries_gen(
+	row: Dict,
+	llm: BaseLLM,
+	lang: str = "en",
+	n: int = 3,
+) -> Dict:
+	"""
+	Generate several queries for one row using the factoid_single_hop prompt.
+
+	The "multiple_queries" extra instruction (asking for ``n`` questions) is
+	appended to the first message of the factoid_single_hop prompt.
+
+	:param row: The row passed through to llama_index_generate_base.
+	:param llm: The LLM passed through to llama_index_generate_base.
+	:param lang: Key used to select the prompt language. Default is "en".
+	:param n: The number of questions requested in the extra instruction. Default is 3.
+	:return: The result of llama_index_generate_base.
+	"""
+	import copy
+
+	# Work on a private copy. QUERY_GEN_PROMPT is a shared module-level object,
+	# and appending to its message in place would make the extra instruction
+	# pile up on every call and leak into other users of the same prompt.
+	_messages = copy.deepcopy(QUERY_GEN_PROMPT["factoid_single_hop"][lang])
+	_messages[0].content += QUERY_GEN_PROMPT_EXTRA["multiple_queries"][lang].format(n=n)
+	return await llama_index_generate_base(row, llm, _messages)
diff --git a/autorag/data/qa/query/prompt.py b/autorag/data/qa/query/prompt.py
@@ -19,7 +19,9 @@
 2. Questions should be as detailed as possible from Text
 3. Create questions that ask about factual information from the Text
 4. Do not mention any of these in the questions: "in the given text", "in the provided information", etc.
-Users do not know the passage source of the question, so it should not be mentioned in the question.""",
+Users do not know the passage source of the question, so it should not be mentioned in the question.
+5. Do not ask about the file name or the file title. Ask about the content of the file.
+For example, avoid to write questions like `What is the file name of the document?`""",
 			)
 		],
 		"ko": [
@@ -41,7 +43,9 @@
 	3.	Text에서 사실적 정보를 요구하는 질문을 만들어야 합니다. 즉, Text를 기반으로 사실 질문을 만드세요.
 	4.	질문에 “주어진 Text에서” 또는 “제공된 단락에서”와 같은 표현을 포함해서는 안 됩니다.
 사용자는 질문의 출처가 Text라는 것을 모르기 때문에 반드시 그 출처를 언급해서는 안 됩니다.
-	5.	질문을 한국어로 작성하세요.""",
+	5.	파일 이름이나 파일 제목에 대한 질문을 하지 마세요. 파일의 내용에 대해 물어보세요.
+예를 들어, '문서의 파일 이름은 무엇입니까?'와 같은 질문을 작성하지 마세요.
+	6.	질문을 한국어로 작성하세요.""",
 			)
 		],
 		"ja": [
@@ -63,7 +67,9 @@
 	3. Textで事実的情報を要求する質問を作らなければなりません。 つまり、Textに基づいて質問を作成します。
 	4. 質問に「与えられたTextで」または「提供された段落で」のような表現を含めてはいけません。
 ユーザーは質問の出所がTextだということを知らないので、必ずしもその出所を言及してはいけません。
-	5. 質問を日本語で作成しなさい。""",
+	5. ファイル名やファイルタイトルを訊かないでください。ファイルの内容について聞いてください。
+例えば、「このドキュメントのファイル名は何ですか？」のような質問は作成しないでください。
+	6. 質問を日本語で作成しなさい。""",
 			)
 		],
 	},
@@ -81,7 +87,9 @@
 3. Create questions that ask about information from the Text
 4. MUST include specific keywords from the Text.
 5. Do not mention any of these in the questions: "in the given text", "in the provided information", etc.
-Users do not know the passage source of the question, so it should not be mentioned in the question.""",
+Users do not know the passage source of the question, so it should not be mentioned in the question.
+6. Do not ask about the file name or the file title. Ask about the content of the file.
+For example, avoid to write questions like `What is the file name of the document?`""",
 			)
 		],
 		"ko": [
@@ -98,7 +106,9 @@
 4.	Text의 특정 키워드를 반드시 질문에 포함하세요.
 5.	질문에 “주어진 Text에서” 또는 “제공된 단락에서”와 같은 표현을 포함해서는 안 됩니다.
 사용자는 질문의 출처가 Text라는 것을 모르기 때문에 반드시 그 출처를 언급해서는 안 됩니다.
-6.	질문을 한국어로 작성하세요.""",
+6.	파일 이름이나 파일 제목에 대한 질문을 하지 마세요. 파일의 내용에 대해 물어보세요.
+예를 들어, '문서의 파일 이름은 무엇입니까?'와 같은 질문을 작성하지 마세요.
+7.	질문을 한국어로 작성하세요.""",
 			)
 		],
 		"ja": [
@@ -115,7 +125,9 @@
 4. Textの特定のキーワードを必ず質問に含みます。
 5. 質問に「与えられたTextで」または「提供された段落で」のような表現を含めてはいけません。
 ユーザーは質問の出所がTextだということを知らないので、必ずしもその出所を言及してはいけません。
-6. 質問を日本語で書きましょう。""",
+6. ファイル名やファイルタイトルを訊かないでください。ファイルの内容について聞いてください。
+例えば、「このドキュメントのファイル名は何ですか？」のような質問は作成しないでください。
+7. 質問を日本語で書きましょう。""",
 			)
 		],
 	},
@@ -179,3 +191,12 @@
 		],
 	},
 }
+
+# Experimental feature
+# Extra instruction snippets keyed by feature name, then by language code.
+# Each snippet is a str.format template; "multiple_queries" expects `n`
+# (the number of questions to request) and is meant to be appended to the
+# content of an existing prompt message.
+QUERY_GEN_PROMPT_EXTRA = {
+	"multiple_queries": {
+		"en": "\nAdditional instructions:\n  - Please make {n} questions.",
+		"ko": "\n추가 지침:\n  - 질문은 {n}개를 만드세요.",
+		"ja": "\n追加指示:\n  - 質問を{n}個作成してください。",
+	}
+}