-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.py
157 lines (135 loc) · 4.89 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import os
import json
import google.generativeai as genai
from google.ai.generativelanguage_v1beta.types import content
from pybars import Compiler
from dotenv import load_dotenv
from deepgram import Deepgram
import feedparser
import asyncio
# Load environment variables (python-dotenv) before any API key is read.
load_dotenv()

# Authenticate the Gemini client. A missing GEMINI_API_KEY raises KeyError
# here, at import time.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

# Shape of a single newsletter section in the structured response:
# a numeric timestamp plus a header and body text, all mandatory.
_section_schema = content.Schema(
    type=content.Type.OBJECT,
    enum=[],
    required=["timestamp", "header", "content"],
    properties={
        "timestamp": content.Schema(type=content.Type.NUMBER),
        "header": content.Schema(type=content.Type.STRING),
        "content": content.Schema(type=content.Type.STRING),
    },
)

# Shape of the whole newsletter: title, summary and an array of sections.
_newsletter_schema = content.Schema(
    type=content.Type.OBJECT,
    enum=[],
    required=["title", "summary", "sections"],
    properties={
        "title": content.Schema(type=content.Type.STRING),
        "summary": content.Schema(type=content.Type.STRING),
        "sections": content.Schema(
            type=content.Type.ARRAY,
            items=_section_schema,
        ),
    },
)

# Sampling parameters plus the JSON schema the model is asked to follow.
generation_config = {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 40,
    "max_output_tokens": 8192,
    "response_schema": _newsletter_schema,
    "response_mime_type": "application/json",
}

# Shared model handle used by main() to generate the newsletter.
model = genai.GenerativeModel(
    model_name="gemini-2.0-flash-exp",
    generation_config=generation_config,
)
def _format_timestamp(seconds):
    """Render an offset in seconds as a zero-padded ``HH:MM:SS`` string.

    Fractional seconds are truncated and negative offsets are clamped to
    zero so the result is always a valid clock-style label.
    """
    total = max(0, int(seconds))
    hours, remainder = divmod(total, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours:02}:{minutes:02}:{secs:02}"


def _extract_segments(response):
    """Flatten a transcription response into timestamped text segments.

    Reads the paragraph list of the first alternative of the first channel
    and returns one ``{'timestamp': ..., 'content': ...}`` dict per
    paragraph, where ``content`` is the paragraph's sentences joined by
    single spaces.

    Raises:
        KeyError, IndexError, TypeError: if ``response`` does not have the
            expected nesting.
    """
    paragraphs = response['results']['channels'][0]['alternatives'][0]['paragraphs']['paragraphs']
    return [
        {
            'timestamp': paragraph['start'],
            'content': ' '.join(sentence['text'] for sentence in paragraph['sentences']),
        }
        for paragraph in paragraphs
    ]


async def main():
    """Build ``newsletter.md`` from the newest episode of the podcast feed.

    Steps: fetch the RSS feed, transcribe the newest episode's audio with
    Deepgram, ask the module-level Gemini ``model`` for a structured
    newsletter, render it through a Handlebars template and write the
    result to ``newsletter.md`` in the current directory.

    Recoverable failures (empty feed, missing audio, transcription error,
    malformed responses) are reported on stdout and end the run early.

    Raises:
        EnvironmentError: if ``DEEPGRAM_API_KEY`` is not set.
    """
    # Fail fast on missing credentials, before any network traffic.
    deepgram_api_key = os.getenv("DEEPGRAM_API_KEY")
    if not deepgram_api_key:
        raise EnvironmentError("DEEPGRAM_API_KEY not found in environment variables.")

    # Fetch latest podcast episode from RSS
    rss = feedparser.parse('https://www.allearsenglish.com/feed/podcast')
    if not rss.entries:
        print("No episodes found in the podcast feed.")
        return
    episode = rss.entries[0]
    enclosures = getattr(episode, "enclosures", None)
    if not enclosures:
        print("Latest episode has no audio enclosure.")
        return
    audio_url = enclosures[0].href
    episode_title = episode.get("title", "")
    episode_description = episode.get("description", "")

    # Transcribe podcast using Deepgram
    deepgram = Deepgram(deepgram_api_key)
    source = {'url': audio_url}
    transcription_options = {"punctuate": True, "diarize": True, "paragraphs": True}
    print("Transcribing podcast...")
    try:
        transcription = await deepgram.transcription.prerecorded(source, transcription_options)
    except Exception as e:
        # Top-level boundary: the SDK's exception types are not visible here,
        # so report whatever went wrong and stop.
        print(f"Error during transcription: {e}")
        return

    # Extract paragraphs with timestamps
    try:
        transcript_segments = _extract_segments(transcription)
    except (KeyError, IndexError, TypeError) as e:
        print(f"Unexpected transcription response format: {e}")
        return
    if not transcript_segments:
        print("Transcription contained no paragraphs; nothing to summarize.")
        return

    # Generate newsletter using Gemini
    system_instruction = f"""
You are creating a newsletter for a podcast titled '{episode_title}'.
Description: {episode_description}
The transcript is divided into timed segments. For each segment:
1. Create a section with a descriptive header
2. Write 1-2 detailed paragraphs explaining the content
3. Use the provided timestamp
4. Maintain professional tone without advertisements
Include an overall title and summary for the newsletter.
Don't include any sponsorships or advertisements.
"""
    print("Generating newsletter...")
    chat_session = model.start_chat(history=[])
    gemini_response = chat_session.send_message(
        f"System: {system_instruction}\nTranscript segments: {json.dumps(transcript_segments)}"
    )

    # Process Gemini response. json.JSONDecodeError is a ValueError subclass;
    # NOTE(review): the SDK's .text accessor is also believed to raise
    # ValueError when the response has no usable content - confirm.
    try:
        newsletter_data = json.loads(gemini_response.text)
    except ValueError as e:
        print(f"Could not parse newsletter response: {e}")
        return

    # Format timestamps
    for section in newsletter_data["sections"]:
        section["formatted_timestamp"] = _format_timestamp(section["timestamp"])

    # Generate Markdown
    # NOTE(review): Handlebars double-stash fields are HTML-escaped, so
    # quotes/ampersands presumably appear as entities in the output - confirm
    # this is acceptable for the Markdown consumer.
    markdown_template = """
# {{title}}
{{summary}}
{{#each sections}}
## {{header}}
{{content}}
[Listen at {{formatted_timestamp}}]({{../base_url}}#t={{timestamp}})
{{/each}}
"""
    compiler = Compiler()
    template = compiler.compile(markdown_template)
    newsletter_data["base_url"] = audio_url
    output = template(newsletter_data)

    # Save newsletter. str() is a no-op for a plain string and joins the
    # pieces if the template engine hands back a list-like result.
    with open("newsletter.md", "w", encoding="utf-8") as f:
        f.write(str(output))
    print("Newsletter generated successfully!")
# Script entry point: run the async pipeline to completion when this file is
# executed directly (not when it is imported).
if __name__ == "__main__":
    asyncio.run(main())