-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpipeline.yml
More file actions
120 lines (103 loc) · 3.45 KB
/
pipeline.yml
File metadata and controls
120 lines (103 loc) · 3.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# PIPELINE CONFIGURATION FOR SYNTHETIC DOCUMENT GENERATION
################
# LLM SETTINGS #
################
llm:
  # NOTE(review): nesting below was reconstructed from a flattened copy of this
  # file; confirm the provider sections are read from under `llm` by the loader.

  # enabled: whether to call the LLM API
  ## true: generate content using LLM
  ## false: save prompts only (debugging only)
  enabled: true

  # provider: which LLM provider to use
  ## 'gemini': Google Gemini API
  ## 'claude': Anthropic Claude API
  ## 'local': local OpenAI-compatible API
  ## note: ignored when enabled is set to false
  provider: gemini

  # gemini configuration
  gemini:
    model: gemini-2.5-flash
    temperature: 1.0
    max_tokens: 8000

  # claude configuration
  claude:
    model: claude-sonnet-4-5-20250929
    temperature: 1.0
    max_tokens: 8000

  # local api configuration
  ## note: base_url and model are configured via LOCAL_LLM_BASE_URL and LOCAL_LLM_MODEL in .env
  local:
    temperature: 1.0
    max_tokens: 8000
######################
# SAMPLING BEHAVIOUR #
######################
profile_selection:
  # domain: which profile domain to use
  ## e.g. cancer: cancer-related profiles
  ## e.g. general: general medical profiles
  domain: general

  # mode: 'random' or 'sequential'
  ## random: randomly sample profiles with replacement
  ## sequential: iterate through profiles in order
  mode: sequential

  # count: number of documents to generate
  ## int: generate exactly that many documents then stop
  ## -1: generate documents for all profiles in selected file(s)
  count: -1

  # file: list of filenames or null
  ## null: process all .yml files in the profiles/{domain}/ directory
  ## [file1.yml, file2.yml, ...]: process only the listed files in profiles/{domain}/
  ## the commented-out entries below are example filenames; to use them,
  ## replace `null` with a block list and uncomment the wanted entries
  file: null
    # - head_neck.yml
    # - skin.yml
    # - haem.yml
    # - brain.yml
    # - tissue.yml
    # - liver.yml
#####################
# SAMPLE STRUCTURES #
#####################
structure_selection:
  # enabled_structures: explicit list of structure template files
  ## note: the selection method is always random
  ## NOTE(review): the meaning of null is not documented here
  ## (presumably "no restriction") - confirm against the loader
  enabled_structures: null
################
# STYLE CONFIG #
################
style_selection:
  # file: single style configuration file from config/style/
  ## required: names the style config to use
  file: general.yml
##################
# CONTENT CONFIG #
##################
content_selection:
  # file: single content configuration file from config/content/
  ## required: names the content config to use
  file: general.yml
########################
# PROMPT CONFIGURATION #
########################
prompt_config:
  # include_style: whether to include prompts from the selected style config
  ## true: include style requirements
  ## false: exclude style requirements
  include_style: true

  # include_content: whether to include prompts from the selected content config
  ## true: include content requirements
  ## false: exclude content requirements
  include_content: true

  # prompt_template: name of the system prompt file to use from the prompts/ directory
  ## the file is loaded from prompts/{prompt_template}.md
  prompt_template: general
#################
# OUTPUT FOLDER #
#################
output:
  # subdirectory: subdirectory of ./output/ where generated documents are saved
  ## the resulting path is ./output/{subdirectory}/
  subdirectory: general

  # skip_existing: whether to skip profiles that already have generated documents
  ## true: skip profiles whose documents already exist in the output folder
  ## false: always generate documents (may create duplicates with different timestamps)
  skip_existing: true