-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy patharxiv_digest.py
123 lines (102 loc) · 4.27 KB
/
arxiv_digest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import streamlit as st
import json
from src.rss import ArxivRSS
from src.utils import merge_dicts
from src.llm import LLMPaperReader
import asyncio
# Load config
with open("./config.json") as f:
config = json.load(f)
def display_papers(topic, paper_list):
st.markdown(f"## {topic} ({len(paper_list)} papers)")
for arxiv_paper in paper_list:
st.markdown(f"## [{arxiv_paper['title']}]({arxiv_paper['url']})")
st.markdown(f"{', '.join(arxiv_paper['authors'])}")
with st.expander("Abstract"):
st.markdown(arxiv_paper["abstract"])
if "judgement" in arxiv_paper:
with st.expander("Judgement"):
for topic, relevance in arxiv_paper["judgement"].items():
if relevance["relevance"] > 0:
st.markdown(
f"{relevance['relevance']} || {topic} || {relevance['reason']}"
)
# @st.cache_data
def fetch_arxiv_papers(config):
paper_lists = []
with st.status(
"Fetching data from arxiv", expanded=False
) as status: # Changed to st.sidebar.status
for arxiv_subject in config["arxiv_subjects"]:
info_text = f"📡 Fetching data from {arxiv_subject}"
st.write(info_text)
status.update(label=info_text, state="running")
rss_url = config["arxiv_rss_base_url"] + arxiv_subject
arss = ArxivRSS(rss_url)
paper_list = arss.fetch_paper_list()
info_text = f"📄 Obtained {len(paper_list)} papers from {arxiv_subject}"
st.write(info_text)
status.update(label=info_text, state="running")
paper_lists.append(paper_list)
merged_paper_list = merge_dicts(paper_lists)
info_text = f"✅ Done obtaining {len(merged_paper_list)} papers"
st.write(info_text)
status.update(label=info_text, state="complete", expanded=False)
return merged_paper_list
def classify_papers(paper_list):
papers_by_topics = {}
papers_by_topics["All"] = paper_list.values()
for paper_id, paper in paper_list.items():
for topic, relevance in paper["judgement"].items():
if relevance["relevance"] > 0.7:
if topic not in papers_by_topics:
papers_by_topics[topic] = []
papers_by_topics[topic].append(paper)
# print(papers_by_topics[topic])
st.session_state["papers_by_topics"] = papers_by_topics
def llm_read_papers(paper_list):
llm_reader = LLMPaperReader(
config["openai_model"], config["topics"], config["timeout_seconds"]
)
papers_to_read = []
for paper_id, paper in paper_list.items():
if "judgement" not in paper:
papers_to_read.append(paper)
judgement_list = asyncio.run(
llm_reader.read_papers(
papers_to_read,
number_of_concurrent_tasks=config["number_of_concurrent_tasks"],
)
)
success_judgement_list = []
for judgement in judgement_list:
if (
not isinstance(judgement, Exception)
and judgement.get("judgement") is not None
):
success_judgement_list.append(judgement)
print(f"Read {len(success_judgement_list)} out of {len(judgement_list)} papers...")
judgement_results = {}
for judgement in success_judgement_list:
judgement_results[judgement["id"]] = judgement["judgement"]
paper_with_judgement = {}
for paper_id in paper_list.keys():
if paper_id in judgement_results:
paper_judgement = judgement_results[paper_id]
paper_with_judgement[paper_id] = paper_list[paper_id]
paper_with_judgement[paper_id]["judgement"] = paper_judgement
classify_papers(paper_with_judgement)
# Sidebar
st.sidebar.title("arXiv digest")
if st.sidebar.button("Fetch new papers"):
st.session_state["arxiv_papers"] = fetch_arxiv_papers(config)
if st.sidebar.button("Read papers"):
llm_read_papers(st.session_state["arxiv_papers"])
# Define selected_topic before using it
selected_topic = st.sidebar.radio(
"Select Topic",
list(st.session_state.get("papers_by_topics", {}).keys()),
index=0,
)
if "papers_by_topics" in st.session_state:
display_papers(selected_topic, st.session_state["papers_by_topics"][selected_topic])