|
| 1 | +""" |
| 2 | +Script to parse maintainers data for Forklore PR |
| 3 | +Intended both for maintainers who want to self-contribute and for anyone who simply wants to help out! |
| 4 | +""" |
| 5 | + |
| 6 | +import re |
| 7 | +import json |
| 8 | +from html import escape |
| 9 | + |
def is_image(url):
    """Return True if *url* ends with a known image file extension.

    The comparison is case-insensitive. Besides the URL as given, the URL
    with any ``?query`` and ``#fragment`` removed is also checked, so links
    such as ``https://host/logo.png?raw=true`` are recognised as images.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp')
    lowered = url.lower()
    # Drop "?query" and "#fragment" so they cannot hide the file extension.
    without_suffix = lowered.split('?', 1)[0].split('#', 1)[0]
    return lowered.endswith(image_extensions) or without_suffix.endswith(image_extensions)
| 12 | + |
def format_response(value: str) -> str:
    """Render a free-text form answer as an HTML fragment.

    The text is stripped, every ``http://`` / ``https://`` URL (a run of
    non-whitespace characters) becomes an ``<img>`` tag when ``is_image``
    accepts it and an ``<a>`` tag otherwise, and newlines become ``<br>``.

    The answer comes from a user-written issue, so all text outside the
    generated tags is HTML-escaped, and URLs are escaped before being placed
    in attributes, to keep stray ``<``, ``&`` or quotes from breaking or
    injecting markup.
    """
    text = value.strip()

    pieces = []
    cursor = 0
    for match in re.finditer(r'https?://\S+', text):
        # Plain text between the previous URL and this one.
        pieces.append(escape(text[cursor:match.start()], quote=False))

        raw_url = match.group(0)
        # quote=True so a '"' inside the URL cannot terminate the attribute.
        safe_url = escape(raw_url, quote=True)
        if is_image(raw_url):
            pieces.append(f'<img src="{safe_url}" alt="image" />')
        else:
            pieces.append(f'<a href="{safe_url}">{safe_url}</a>')
        cursor = match.end()
    pieces.append(escape(text[cursor:], quote=False))

    # URLs never contain whitespace, so remaining newlines are all plain text.
    return ''.join(pieces).replace('\n', '<br>')
| 28 | + |
def parse_multiline_field(lines, start_index):
    """Collect the value lines that follow the label at ``start_index``.

    Scanning begins on the line after ``start_index`` and stops at the first
    line that is a bold field label (starts and ends with ``**``) or a
    ``## `` / ``### `` section heading, or when ``lines`` runs out.

    Returns a ``(value, next_index)`` tuple: the collected lines joined with
    single spaces and stripped, and the index at which scanning stopped.
    """
    first = start_index + 1
    collected = []
    for stop in range(first, len(lines)):
        candidate = lines[stop]
        is_label = candidate.startswith("**") and candidate.endswith("**")
        is_heading = candidate.startswith(("### ", "## "))
        if is_label or is_heading:
            break
        collected.append(candidate)
    else:
        # No boundary found: report the position just past the scanned range
        # (or the untouched start position if there was nothing to scan).
        stop = max(first, len(lines))
    return " ".join(collected).strip(), stop
| 42 | + |
| 43 | + |
# (output key, issue-template label) pairs for a project, in JSON output order.
_PROJECT_KEYS = (
    ("name", "Name"),
    ("project_link", "Project Link"),
    ("website_link", "Website Link"),
    ("logo", "Logo URL"),
    ("description", "Full Description"),
    ("short_description", "Short Description"),
)

# Bold labels of the single-line user detail fields and their output keys.
_USER_DETAIL_KEYS = {
    "**Username:**": "username",
    "**Full Name:**": "full_name",
    "**Photo URL:**": "photo",
    "**Designation / Role:**": "designation",
}


def _starts_new_item(line):
    """Return True if *line* opens a bold label/question or a Markdown heading."""
    return line.startswith("**") or re.match(r"#{1,6}(\s|$)", line) is not None


def _read_single_value(lines, label_index):
    """Return ``(value, next_index)`` for the one-line field labelled at *label_index*."""
    value_index = label_index + 1
    if value_index < len(lines) and not _starts_new_item(lines[value_index]):
        return lines[value_index], value_index + 1
    # The value was left blank (or the label is the last line): report an
    # empty value and leave the following line to be parsed normally.
    return "", value_index


def _project_to_json(project):
    """Rename a project's template labels to the keys used in the JSON output."""
    return {key: project.get(label, "") for key, label in _PROJECT_KEYS}


def parse_issue(markdown: str):
    """Parse a maintainer-submission issue body into a dictionary.

    HTML comments and blank lines are discarded, then the remaining lines are
    scanned for three kinds of content:

    * user details -- ``**Username:**``, ``**Full Name:**``, ``**Photo URL:**``
      and ``**Designation / Role:**`` take their value from the following
      line; ``**Social Profiles:**`` is followed by ``label: link`` lines;
    * projects -- each ``### Project`` heading starts a new project whose
      bold-labelled fields are read with ``parse_multiline_field``;
    * form questions -- bold numbered questions (``**1. ...**``) followed by
      the answer lines, rendered with ``format_response``.

    Returns a dict with the keys ``username``, ``full_name``, ``photo``,
    ``designation``, ``socials`` (list of ``{"label", "link"}``),
    ``projects`` (list of dicts keyed ``name``, ``project_link``,
    ``website_link``, ``logo``, ``description``, ``short_description``) and
    ``form`` (list of ``{"question", "response"}``). Fields missing from the
    input keep their empty default.
    """
    # Remove all comments (<!-- ... -->), including multi-line ones.
    markdown = re.sub(r'<!--.*?-->', '', markdown, flags=re.DOTALL)

    # Work on stripped, non-empty lines only.
    stripped_lines = (line.strip() for line in markdown.strip().splitlines())
    lines = [line for line in stripped_lines if line]

    data = {
        "username": "",
        "full_name": "",
        "photo": "",
        "designation": "",
        "socials": [],
        "projects": [],
        "form": [],
    }

    current_project = {}
    project_labels = tuple(f"**{label}:**" for _, label in _PROJECT_KEYS)

    i = 0
    while i < len(lines):
        line = lines[i]

        # ==== USER DETAILS ====
        detail_key = next(
            (key for label, key in _USER_DETAIL_KEYS.items() if line.startswith(label)),
            None,
        )
        if detail_key is not None:
            data[detail_key], i = _read_single_value(lines, i)
            continue

        if line.startswith("**Social Profiles:**"):
            i += 1
            # Stop at the next label/heading: bold labels also contain ':' and
            # must not be mistaken for a "label: link" entry.
            while i < len(lines) and ':' in lines[i] and not _starts_new_item(lines[i]):
                label, link = lines[i].split(":", 1)
                data["socials"].append({"label": label.strip(), "link": link.strip()})
                i += 1
            continue

        # ==== PROJECTS ====
        if line.startswith("### Project"):
            # Flush the previous project using the same key names as the last one.
            if current_project:
                data["projects"].append(_project_to_json(current_project))
                current_project = {}
            i += 1
            continue

        if line.startswith(project_labels):
            field = re.match(r"\*\*(.*?):\*\*", line).group(1).strip()
            value, i = parse_multiline_field(lines, i)
            current_project[field] = value
            continue

        # ==== FORM QUESTIONS ====
        question_match = re.match(r"\*\*(\d+\..*?)\*\*", line)
        if question_match:
            question = question_match.group(1).strip()
            # Capture all lines under the question until the next bold line or the end.
            i += 1
            response_lines = []
            while i < len(lines) and not lines[i].startswith("**"):
                response_lines.append(lines[i])
                i += 1
            data["form"].append({
                # Drop the leading "N." numbering from the stored question.
                "question": re.sub(r"^\d+\.\s*", "", question),
                "response": format_response("\n".join(response_lines).strip()),
            })
            continue

        i += 1

    # Append the last project, if any fields were collected for it.
    if current_project:
        data["projects"].append(_project_to_json(current_project))

    return data
| 153 | + |
| 154 | + |
if __name__ == "__main__":
    import sys

    # The script needs one positional argument: the Markdown issue file.
    if len(sys.argv) < 2:
        print("Usage: python parse_issue.py <input_file.md>")
        sys.exit(1)

    # Read the issue body, parse it, and dump the result as pretty JSON.
    with open(sys.argv[1], "r", encoding="utf-8") as handle:
        parsed = parse_issue(handle.read())
        print(json.dumps(parsed, indent=2, ensure_ascii=False))
| 168 | + |
0 commit comments