# pipeline.yml - MESA-Docsynth (londonaicentre/MESA-Docsynth, branch: main)

# PIPELINE CONFIGURATION FOR SYNTHETIC DOCUMENT GENERATION

################
# LLM SETTINGS #
################
llm:
  # Settings for the LLM call: on/off switch, provider choice, and one
  # settings sub-section per provider.

  # enabled: whether to call the LLM API
  ## true: generate content using LLM
  ## false: save prompts only (debugging only)
  enabled: true

  # provider: which LLM provider to use
  ## 'gemini': Google Gemini API
  ## 'claude': Anthropic Claude API
  ## 'local': Local OpenAI-compatible API
  ## note that this is ignored when `enabled` is set to false
  provider: gemini

  # gemini configuration (settings for provider 'gemini')
  gemini:
    model: gemini-2.5-flash
    temperature: 1.0
    max_tokens: 8000

  # claude configuration (settings for provider 'claude')
  claude:
    model: claude-sonnet-4-5-20250929
    temperature: 1.0
    max_tokens: 8000

  # local api configuration (settings for provider 'local')
  ## note: base_url and model are not set here; they are configured via
  ## LOCAL_LLM_BASE_URL and LOCAL_LLM_MODEL in .env
  local:
    temperature: 1.0
    max_tokens: 8000

######################
# SAMPLING BEHAVIOUR #
######################
profile_selection:
  # Chooses which profiles are used and how many documents are generated.

  # domain: which profile domain to use
  ## profile files are read from the profiles/{domain}/ directory
  ## e.g. cancer: cancer-related profiles
  ## e.g. general: general medical profiles
  domain: general

  # mode: 'random' or 'sequential'
  ## random: randomly sample profiles with replacement
  ## sequential: iterate through profiles in order
  mode: sequential

  # count: number of documents to generate
  ## int: generate exactly that many documents then stop
  ## -1: generate documents for all profiles in selected file(s)
  ## NOTE(review): how -1 behaves with mode 'random' is not described here - confirm
  count: -1

  # file: list of filenames or null
  ## null: processes all .yml files in profiles/{domain}/ directory
  ## [file1.yml, file2.yml, ...]: processes specified files in profiles/{domain}/ directory
  ## to use the commented-out list below, first delete `null` so that `file:`
  ## is followed directly by the list items; leaving `null` in place makes
  ## YAML fold the items into a single string instead of a list
  ## NOTE(review): the example names below look cancer-specific - confirm they
  ## exist under profiles/{domain}/ for the selected domain before enabling
  file: null
    # - head_neck.yml
    # - skin.yml
    # - haem.yml
    # - brain.yml
    # - tissue.yml
    # - liver.yml

#####################
# SAMPLE STRUCTURES #
#####################
structure_selection:
  # enabled_structures: explicit list of structure template files
  ## note that selection method is always random
  ## NOTE(review): the meaning of null is not documented here - presumably
  ## every available structure template is eligible; confirm against the loader
  enabled_structures: null

#################
# STYLE CONFIG  #
#################
style_selection:
  # file: single style configuration file from config/style/
  ## required - specify which style config to use
  ## with the value below this refers to config/style/general.yml
  file: general.yml

##################
# CONTENT CONFIG #
##################
content_selection:
  # file: single content configuration file from config/content/
  ## required - specify which content config to use
  ## with the value below this refers to config/content/general.yml
  file: general.yml

########################
# PROMPT CONFIGURATION #
########################
prompt_config:
  # Controls which pieces are assembled into the prompt.

  # include_style: whether to include prompts from selected style config
  ## true: include style requirements
  ## false: exclude style requirements
  include_style: true

  # include_content: whether to include prompts from selected content config
  ## true: include content requirements
  ## false: exclude content requirements
  include_content: true

  # prompt_template: name of system prompt file to use from prompts/ directory
  ## give the name without the extension;
  ## file will be loaded from prompts/{prompt_template}.md
  ## with the value below this is prompts/general.md
  prompt_template: general

#################
# OUTPUT FOLDER #
#################
output:
  # subdirectory: subdirectory in ./output/ where generated documents are saved
  ## resulting path will be ./output/{subdirectory}/
  ## with the value below this is ./output/general/
  subdirectory: general

  # skip_existing: whether to skip profiles that already have generated documents
  ## true: skip profiles that already have a generated document in the output folder
  ## false: always generate documents (may create duplicates with different timestamps)
  skip_existing: true