diff --git a/baselines/BDS/README.md b/baselines/BDS/README.md
new file mode 100644
index 0000000..9ef5aba
--- /dev/null
+++ b/baselines/BDS/README.md
@@ -0,0 +1,2 @@
+# Bottom-up Domain-specific Superintelligence: A Reliable Knowledge Graph is What We Need
+https://arxiv.org/pdf/2507.13966
\ No newline at end of file
diff --git a/baselines/BDS/__init__.py b/baselines/BDS/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/baselines/BDS/bds.py b/baselines/BDS/bds.py
new file mode 100644
index 0000000..da677b0
--- /dev/null
+++ b/baselines/BDS/bds.py
@@ -0,0 +1,236 @@
+"""BDS baseline: multiple-choice QA generation from knowledge-graph paths.
+
+Reference: "Bottom-up Domain-specific Superintelligence: A Reliable Knowledge
+Graph is What We Need" (https://arxiv.org/pdf/2507.13966).
+
+The script enumerates fixed-length simple paths in a GraphML knowledge graph,
+asks an LLM to write one agriculture exam question per path, and dumps the
+parsed questions to a JSON file.
+"""
+
+import argparse
+import asyncio
+import json
+import os
+from dataclasses import dataclass
+from typing import List
+
+import networkx as nx
+from dotenv import load_dotenv
+from tqdm.asyncio import tqdm as tqdm_async
+
+from graphgen.models import NetworkXStorage, OpenAIModel, Tokenizer
+from graphgen.utils import create_event_loop
+
+# The tags below are the contract between the prompt and _post_process():
+# the parser cuts the response on exactly these delimiters.
+QA_GENERATION_PROMPT = """
+Create an agriculture examination question for advanced agricultural students that tests the relationship between {src} and {tgt}. The relationship is: {path}. The question should:
+    1. Be in multiple choice format (4 options)
+    2. Require agriculture reasoning along the relationship
+    3. Include a brief farm or field scenario
+    4. Not directly mention the relationship in the question stem
+    5. Have one clearly correct answer
+Format:
+    <QUESTION>
+    [Farm or Field Scenario]
+    </QUESTION>
+    <OPTIONS>
+    A. [Option]
+    B. [Option]
+    C. [Option]
+    D. [Option]
+    </OPTIONS>
+    <ANSWER>:
+    [Correct Option Letter]
+    </ANSWER>
+"""
+
+
+def _post_process(text: str) -> dict:
+    """Parse one raw LLM response into a structured multiple-choice item.
+
+    Args:
+        text: Raw completion expected to follow the tagged layout requested by
+            QA_GENERATION_PROMPT.
+
+    Returns:
+        A dict with ``question`` (str), ``options`` (list of non-empty option
+        lines), ``answer`` (zero-based index of the correct option) and ``raw``
+        (the unmodified response), or an empty dict when the response cannot
+        be parsed.
+    """
+    try:
+        question = text.split("<QUESTION>")[1].split("</QUESTION>")[0].strip()
+        option_block = text.split("<OPTIONS>")[1].split("</OPTIONS>")[0]
+        options = [line.strip() for line in option_block.splitlines() if line.strip()]
+        # Tolerate both "<ANSWER>:" and a bare "<ANSWER>" before the letter.
+        answer_letter = text.split("<ANSWER>")[1].lstrip(": \t\r\n")[0].upper()
+        answer_index = ord(answer_letter) - ord("A")
+        if not 0 <= answer_index < len(options):
+            raise ValueError(f"answer {answer_letter!r} matches no option")
+    except (AttributeError, IndexError, ValueError) as e:
+        # A malformed completion is dropped instead of aborting the whole batch.
+        print(f"Error in post-processing: {e}")
+        return {}
+    return {
+        "question": question,
+        "options": options,
+        "answer": answer_index,
+        "raw": text,
+    }
+
+
+@dataclass
+class BDS:
+    """Generate multiple-choice questions from knowledge-graph paths.
+
+    Attributes:
+        llm_client: Client whose ``generate_answer`` coroutine is awaited with
+            each formatted prompt.
+        max_concurrent: Upper bound on LLM requests in flight at once.
+    """
+
+    llm_client: OpenAIModel = None
+    max_concurrent: int = 1000
+
+    def generate(self, tasks: List[dict]) -> List[dict]:
+        """Run _async_generate to completion on an event loop (blocking)."""
+        loop = create_event_loop()
+        return loop.run_until_complete(self._async_generate(tasks))
+
+    async def _async_generate(self, tasks: List[dict]) -> List[dict]:
+        """Query the LLM once per task and collect the parsable answers.
+
+        Args:
+            tasks: Dicts with ``src``, ``tgt`` and ``path``, where ``path`` is
+                a list of ``(head, relation, tail)`` triples.
+
+        Returns:
+            Parsed QA dicts in completion order; failed requests and
+            unparsable responses are skipped.
+        """
+        sem = asyncio.Semaphore(self.max_concurrent)
+
+        async def job(item: dict) -> dict:
+            async with sem:
+                path_str = " -> ".join(f"({h},{r},{t})" for h, r, t in item["path"])
+                prompt = QA_GENERATION_PROMPT.format(
+                    src=item["src"], tgt=item["tgt"], path=path_str
+                )
+                resp = await self.llm_client.generate_answer(prompt)
+                return _post_process(resp)
+
+        # Separate name so the input list is not shadowed by the coroutines.
+        jobs = [job(item) for item in tasks]
+        results = []
+        for coro in tqdm_async(asyncio.as_completed(jobs), total=len(jobs)):
+            try:
+                if parsed := await coro:
+                    results.append(parsed)
+            except Exception as e:  # pylint: disable=broad-except
+                # One failed request must not discard the rest of the batch.
+                print("Error:", e)
+        return results
+
+
+def _collect_paths(
+    graph: nx.DiGraph,
+    max_paths: int = 20000,
+    max_sources: int = 1000,
+    num_nodes: int = 4,
+) -> List[list]:
+    """Collect up to max_paths simple paths made of exactly num_nodes nodes.
+
+    Only the first max_sources nodes with at least one outgoing edge are used
+    as path starts, which bounds the enumeration on large graphs.
+    """
+    sources = [n for n in graph.nodes if graph.out_degree(n) > 0][:max_sources]
+    targets = list(graph.nodes)  # loop-invariant, so build it once
+    paths = []
+    for src in sources:
+        for path in nx.all_simple_paths(
+            graph, source=src, target=targets, cutoff=num_nodes - 1
+        ):
+            if len(path) != num_nodes:
+                continue
+            paths.append(path)
+            if len(paths) >= max_paths:
+                return paths
+    return paths
+
+
+def _paths_to_items(graph: nx.DiGraph, paths: List[list]) -> List[dict]:
+    """Turn node paths into prompt items holding (head, relation, tail) triples."""
+    items = []
+    for path in paths:
+        path_edges = []
+        for head, tail in zip(path, path[1:]):
+            edge_data = graph.get_edge_data(head, tail)
+            if edge_data is None:
+                # Fall back to the reverse direction before giving up.
+                edge_data = graph.get_edge_data(tail, head)
+            if edge_data is None:
+                print(f"Warning: No edge data between {head} and {tail}")
+                relation = "related_to"
+            else:
+                relation = edge_data.get("relation", "related_to")
+            path_edges.append((head, relation, tail))
+        items.append({"src": path[0], "tgt": path[-1], "path": path_edges})
+    return items
+
+
+def main() -> None:
+    """Parse CLI arguments, generate QA pairs and write them to disk."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--input_file",
+        help="GraphML input file path.",
+        default="resources/input_examples/graphml_demo.graphml",
+        type=str,
+    )
+    parser.add_argument(
+        "--output_file",
+        help="Output file path.",
+        default="cache/data/bds_qa.jsonl",
+        type=str,
+    )
+    args = parser.parse_args()
+
+    load_dotenv()
+
+    tokenizer_instance: Tokenizer = Tokenizer(
+        model_name=os.getenv("TOKENIZER_MODEL", "cl100k_base")
+    )
+    llm_client = OpenAIModel(
+        model_name=os.getenv("SYNTHESIZER_MODEL"),
+        api_key=os.getenv("SYNTHESIZER_API_KEY"),
+        base_url=os.getenv("SYNTHESIZER_BASE_URL"),
+        tokenizer_instance=tokenizer_instance,
+    )
+    bds = BDS(llm_client=llm_client)
+
+    graph = NetworkXStorage.load_nx_graph(args.input_file)
+    # out_degree and directed path enumeration need a directed graph.
+    digraph = graph if graph.is_directed() else graph.to_directed()
+    print(digraph)
+
+    all_paths = _collect_paths(digraph)
+    print(f"Found {len(all_paths)} 4-node paths")
+
+    items = _paths_to_items(digraph, all_paths)
+    print(f"Prepared {len(items)} items for question generation")
+
+    qa_pairs = bds.generate(items)
+    print(f"Generated {len(qa_pairs)} QA pairs")
+
+    # Create the output directory so a fresh checkout does not fail on open().
+    output_dir = os.path.dirname(args.output_file)
+    if output_dir:
+        os.makedirs(output_dir, exist_ok=True)
+    with open(args.output_file, "w", encoding="utf-8") as f:
+        json.dump(qa_pairs, f, indent=4, ensure_ascii=False)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/baselines/EntiGraph/README.md b/baselines/EntiGraph/README.md
new file mode 100644
index 0000000..ae1a53b
--- /dev/null
+++ b/baselines/EntiGraph/README.md
@@ -0,0 +1,3 @@
+# EntiGraph
+https://arxiv.org/abs/2409.07431
+https://github.com/zitongyang/synthetic_continued_pretraining
\ No newline at end of file
diff --git a/baselines/EntiGraph/entigraph.py b/baselines/EntiGraph/entigraph.py
index 07d3d5d..d04546e 100644
--- 
a/baselines/EntiGraph/entigraph.py
+++ b/baselines/EntiGraph/entigraph.py
@@ -1,6 +1,3 @@
-# https://arxiv.org/abs/2409.07431
-# https://github.com/zitongyang/synthetic_continued_pretraining
-
 import argparse
 import asyncio
 import json
diff --git a/baselines/Genie/README.md b/baselines/Genie/README.md
new file mode 100644
index 0000000..b28f642
--- /dev/null
+++ b/baselines/Genie/README.md
@@ -0,0 +1,2 @@
+# Genie
+https://arxiv.org/pdf/2401.14367
\ No newline at end of file
diff --git a/baselines/Genie/genie.py b/baselines/Genie/genie.py
index 75e713e..9c4479c 100644
--- a/baselines/Genie/genie.py
+++ b/baselines/Genie/genie.py
@@ -1,5 +1,3 @@
-# https://arxiv.org/pdf/2401.14367
-
 import argparse
 import asyncio
 import json
diff --git a/baselines/LongForm/README.md b/baselines/LongForm/README.md
new file mode 100644
index 0000000..3bc7bbf
--- /dev/null
+++ b/baselines/LongForm/README.md
@@ -0,0 +1,3 @@
+# LongForm
+https://arxiv.org/pdf/2304.08460
+https://github.com/akoksal/LongForm/tree/main
\ No newline at end of file
diff --git a/baselines/LongForm/longform.py b/baselines/LongForm/longform.py
index 31feb01..c17cdec 100644
--- a/baselines/LongForm/longform.py
+++ b/baselines/LongForm/longform.py
@@ -1,6 +1,3 @@
-# https://arxiv.org/pdf/2304.08460
-# https://github.com/akoksal/LongForm/tree/main
-
 import argparse
 import asyncio
 import json
diff --git a/baselines/SELF-QA/README.md b/baselines/SELF-QA/README.md
new file mode 100644
index 0000000..d896688
--- /dev/null
+++ b/baselines/SELF-QA/README.md
@@ -0,0 +1,2 @@
+# SELF-QA
+https://arxiv.org/abs/2305.11952
\ No newline at end of file
diff --git a/baselines/SELF-QA/self-qa.py b/baselines/SELF-QA/self-qa.py
index 8ee0307..011915f 100644
--- a/baselines/SELF-QA/self-qa.py
+++ b/baselines/SELF-QA/self-qa.py
@@ -1,5 +1,3 @@
-# https://arxiv.org/abs/2305.11952
-
 import argparse
 import asyncio
 import json
diff --git a/baselines/Wrap/README.md b/baselines/Wrap/README.md
new file mode 100644
index 0000000..f200262
--- 
/dev/null +++ b/baselines/Wrap/README.md @@ -0,0 +1,2 @@ +# Wrap +https://arxiv.org/abs/2401.16380 \ No newline at end of file diff --git a/baselines/Wrap/wrap.py b/baselines/Wrap/wrap.py index 3f71b2f..4c2a3b3 100644 --- a/baselines/Wrap/wrap.py +++ b/baselines/Wrap/wrap.py @@ -1,5 +1,3 @@ -# https://arxiv.org/abs/2401.16380 - import argparse import asyncio import json diff --git a/resources/input_examples/graphml_demo.graphml b/resources/input_examples/graphml_demo.graphml new file mode 100644 index 0000000..9d90c84 --- /dev/null +++ b/resources/input_examples/graphml_demo.graphml @@ -0,0 +1,362 @@ + + + + + + + + + + + + "TECHNOLOGY" + "农业技术是指用于提高作物产量和质量的科学方法,云粳26号的推广依赖于相关的农业技术。" + chunk-59c892b0b8d999d3057866424ac38c5f + 3.277478752446756e-07 + 50 + + + "LOCATION" + "云南省是中国的一个省份,云粳26号在该省的农业研究和推广中发挥了重要作用。" + chunk-59c892b0b8d999d3057866424ac38c5f + 3.128163225886591e-07 + 44 + + + "ORGANIZATION" + "云南省农业科学院粮食作物研究所是一个科研机构,负责育成和研究粮食作物,包括早熟品种云粳26号。" + chunk-59c892b0b8d999d3057866424ac38c5f + 8.336807513843113e-05 + 69 + + + "MISSION" + "农业推广是指将新技术和新品种引入生产实践的过程,云粳26号被列为主导品种以促进其推广。" + chunk-59c892b0b8d999d3057866424ac38c5f + 7.628287841247848e-05 + 51 + + + "CONCEPT" + "食用价值是指食品在营养和口感方面的综合评价,云粳26号因其食味品质好而具有较高的食用价值。" + chunk-59c892b0b8d999d3057866424ac38c5f + 2.997547934514262 + 58 + + + "ORGANIZATION" + "农业部是中国的一个政府机构,负责农业政策的制定和推广,云粳26号被其列为主导品种。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.08804169791274698 + 48 + + + "CONCEPT" + "推广主导品种是指在特定区域内被推荐种植的主要作物品种,云粳26号在西南稻区被列为此类品种。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.24357749228919026 + 55 + + + "LOCATION" + "中海拔稻区是指海拔在1500至1800米之间的稻田区域,适合云粳26号的生长。" + chunk-59c892b0b8d999d3057866424ac38c5f + 5.926949838261465 + 45 + + + "KEYWORD" + "谷壳黄色是云粳26号的外观特征之一,表明其成熟状态。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.4537467972212248 + 35 + + + "KEYWORD" + "高抗稻瘟病是云粳26号的抗病性特征,增强了其种植的可行性。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.010406265234302301 + 45 + + + "KEYWORD" + "落粒性适中是云粳26号的特性之一,影响其收割和加工过程。" + 
chunk-59c892b0b8d999d3057866424ac38c5f + 0.39751120253379707 + 36 + + + "KEYWORD" + "有香味是云粳26号的食味品质之一,增加了其市场吸引力。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.1572458000758473 + 36 + + + "NATURE" + "稻瘟病是一种影响水稻的病害,云粳26号具有高抗稻瘟病的特性,增强了其种植的可行性。" + chunk-59c892b0b8d999d3057866424ac38c5f + 2.880133332981227e-07 + 63 + + + "CONCEPT" + "外观特点是指云粳26号的视觉特征,包括颖尖、谷壳颜色等,影响其市场接受度。" + chunk-59c892b0b8d999d3057866424ac38c5f + 1.668849735265512 + 51 + + + "KEYWORD" + "食味品质好是云粳26号的一个重要特性,影响消费者的选择。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.11790494848439569 + 33 + + + "WORK" + "云粳26号是由云南省农业科学院粮食作物研究所于2005年育成的早熟稻品种,具有良好的外观和食味品质。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.4054990190854595 + 68 + + + "NATURE" + "香味是指米粒在烹饪后散发的气味,云粳26号的米粒具有良好的香味,增加了其市场吸引力。" + chunk-59c892b0b8d999d3057866424ac38c5f + 1.704282376131091 + 60 + + + "DATE" + "2012年是云粳26号被农业部列为西南稻区农业推广主导品种的年份。" + chunk-59c892b0b8d999d3057866424ac38c5f + 1.6109373199087365 + 38 + + + "DATE" + "2005年是云粳26号品种育成的年份,标志着该品种的研发开始。" + chunk-59c892b0b8d999d3057866424ac38c5f + 2.5795454440481884 + 35 + + + "KEYWORD" + "米粒大是云粳26号的一个重要特征,通常与食用价值相关。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.8984304647517591 + 31 + + + "NATURE" + "落粒性是指稻谷在成熟后从稻穗上脱落的特性,云粳26号的落粒性适中,影响其收割和加工的便利性。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.32474459712132514 + 66 + + + "CONCEPT" + "早熟品种是指生长周期较短的作物品种,云粳26号属于这一类,适合在特定气候条件下种植。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.0018348240895041633 + 55 + + + "LOCATION" + "云南中海拔 1 500∼1 800 m 稻区是云粳26号适宜种植的区域,提供了特定的生长环境。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.03589417758929121 + 55 + + + "KEYWORD" + "颖尖无色、无芒是云粳26号的外观特点之一,影响其市场接受度。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.7739797890026089 + 39 + + + "NATURE" + "米粒是稻谷经过加工后得到的食用部分,云粳26号的米粒大且有香味,提升了其食用价值。" + chunk-59c892b0b8d999d3057866424ac38c5f + 7.555004360466345e-05 + 54 + + + "CONCEPT" + "适宜种植区域是指云粳26号推荐的种植环境,主要为云南中海拔 1 500∼1 800 m 稻区。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.4086719057062533 + 58 + + + "LOCATION" + 
"西南稻区是指中国西南地区,云粳26号在此区域被推广。" + chunk-59c892b0b8d999d3057866424ac38c5f + 0.004633279566562749 + 31 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号的推广依赖于相关的农业技术,以提高其种植效率和产量。" + 4.469515500104365e-07 + 39 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号是云南省育成的品种,适合在该省的特定环境中种植。" + 0.0005762292913028245 + 38 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云南省农业科学院粮食作物研究所位于云南省,专注于该省的粮食作物研究和育种工作。" + 0.0004066592991875774 + 55 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号是由云南省农业科学院粮食作物研究所育成的早熟稻品种,体现了该机构的科研成果。" + 8.201402306440362e-05 + 60 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号被列为主导品种,旨在通过农业推广提高其种植和消费。" + 0.0016328476835250948 + 37 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号因其食味品质好而具有较高的食用价值,适合市场需求。" + 9.018137544091494e-05 + 39 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号在2012年被农业部列为西南稻区农业推广主导品种,显示了其在农业政策中的重要性。" + 0.0660654296723789 + 51 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号在西南稻区被列为推广主导品种,显示其在该地区的重要性。" + 0.004850300392718063 + 36 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "中海拔稻区是云粳26号推荐的种植区域,提供了适合其生长的气候条件。" + 0.4154816437684379 + 44 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "谷壳黄色是云粳26号的外观特征之一,影响其消费者的接受度。" + 0.055662130155681604 + 39 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "高抗稻瘟病是云粳26号的抗病性特征,增强了其种植的可行性。" + 0.007417909388497926 + 45 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "落粒性适中是云粳26号的特性之一,影响其收割和加工的便利性。" + 0.04371501902711867 + 39 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "有香味是云粳26号的食味品质之一,增加了其市场吸引力。" + 0.238045756594794 + 36 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号具有高抗稻瘟病的特性,表明其在种植过程中能够抵御这一病害。" + 0.006100528949900764 + 51 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号的外观特点包括颖尖无色、无芒等,影响其市场表现和消费者选择。" + 0.4906831717572373 + 43 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "食味品质好是云粳26号的一个重要特性,影响消费者的选择。" + 0.013361831643390068 + 33 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号适宜在云南中海拔 1 500∼1 800 m 稻区种植,表明其对环境的适应性。" + 0.39592594481115617 + 52 + + + 
chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号适宜在云南中海拔 1 500∼1 800 m 稻区种植,表明其生长环境的要求。" + 0.5030086991360689 + 51 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "米粒大是云粳26号的重要特征,通常与其食用价值相关。" + 0.14715079462621358 + 31 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号是一种早熟品种,适合在特定的气候条件下种植,具有较短的生长周期。" + 0.008795430480896508 + 49 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号于2005年育成,标志着该品种的研发历程。" + 2.4413040461122195 + 31 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号的香味是其食用品质的重要组成部分,吸引了消费者的关注。" + 2.379013142252285e-06 + 40 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "颖尖无色、无芒是云粳26号的外观特点,影响其市场表现。" + 2.96823331740096 + 35 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号的落粒性适中,影响其在收割时的表现和加工效率。" + 0.11254188650424042 + 36 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "2012年云粳26号被农业部列为西南稻区农业推广主导品种,标志着其推广的重要性。" + 0.26297679025 + 46 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "云粳26号的米粒大且有香味,提升了其在市场上的竞争力。" + 0.0008942391715159215 + 37 + + + chunk-59c892b0b8d999d3057866424ac38c5f + "2012年云粳26号在西南稻区被推广,表明其在该地区的适应性和重要性。" + 0.001021465373999705 + 39 + + diff --git a/scripts/baselines/generate_bds.sh b/scripts/baselines/generate_bds.sh new file mode 100644 index 0000000..d4bd9e8 --- /dev/null +++ b/scripts/baselines/generate_bds.sh @@ -0,0 +1,2 @@ +python3 -m baselines.BDS.bds --input_file resources/input_examples/graphml_demo.graphml \ + --output_file cache/data/bds.json \ diff --git a/scripts/baselines/generate_entigraph.sh b/scripts/baselines/generate_entigraph.sh index ce9cc99..8474c96 100644 --- a/scripts/baselines/generate_entigraph.sh +++ b/scripts/baselines/generate_entigraph.sh @@ -1,3 +1,3 @@ -python3 -m baselines.EntiGraph.entigraph --input_file resources/examples/raw_demo.jsonl \ +python3 -m baselines.EntiGraph.entigraph --input_file resources/input_examples/raw_demo.jsonl \ --data_type raw \ --output_file cache/data/entigraph.json \ diff --git a/scripts/baselines/generate_genie.sh b/scripts/baselines/generate_genie.sh index 
0119930..3a06de1 100644
--- a/scripts/baselines/generate_genie.sh
+++ b/scripts/baselines/generate_genie.sh
@@ -1,3 +1,3 @@
-python3 -m baselines.Genie.genie --input_file resources/examples/raw_demo.jsonl \
+python3 -m baselines.Genie.genie --input_file resources/input_examples/raw_demo.jsonl \
     --data_type raw \
     --output_file cache/data/genie.json \
diff --git a/scripts/baselines/generate_longform.sh b/scripts/baselines/generate_longform.sh
index d7ed70c..62de848 100644
--- a/scripts/baselines/generate_longform.sh
+++ b/scripts/baselines/generate_longform.sh
@@ -1,3 +1,3 @@
-python3 -m baselines.LongForm.longform --input_file resources/examples/raw_demo.jsonl \
+python3 -m baselines.LongForm.longform --input_file resources/input_examples/raw_demo.jsonl \
     --data_type raw \
     --output_file cache/data/longform.json \
diff --git a/scripts/baselines/generate_selfqa.sh b/scripts/baselines/generate_selfqa.sh
index 18eb7b1..ef13e72 100644
--- a/scripts/baselines/generate_selfqa.sh
+++ b/scripts/baselines/generate_selfqa.sh
@@ -1,3 +1,3 @@
-python3 -m baselines.SELF-QA.self-qa --input_file resources/examples/raw_demo.jsonl \
+python3 -m baselines.SELF-QA.self-qa --input_file resources/input_examples/raw_demo.jsonl \
     --data_type raw \
     --output_file cache/data/self-qa.json \
diff --git a/scripts/baselines/generate_wrap.sh b/scripts/baselines/generate_wrap.sh
index f10857a..fcaf393 100644
--- a/scripts/baselines/generate_wrap.sh
+++ b/scripts/baselines/generate_wrap.sh
@@ -1,3 +1,3 @@
-python3 -m baselines.Wrap.wrap --input_file resources/examples/raw_demo.jsonl \
+python3 -m baselines.Wrap.wrap --input_file resources/input_examples/raw_demo.jsonl \
     --data_type raw \
     --output_file cache/data/wrap.json \