"""Convert the tables of a saved HTML article into a Markdown result file.

The script reads ``raw_path``, takes the ``<article>`` body when there is one
(otherwise the whole page), and writes to ``result_path``: the first ``<h1>``
as a title, every table as a Markdown table, and - when present - a
"Data download" section with the "DOWNLOAD ALL" link.

NOTE(review): the tag names used in the patterns below (article, table, tr,
h1, heading before "Data download", a) were reconstructed from the comments
and variable names of the previous revision, whose patterns had lost their
tag literals. Confirm them against the real page markup.
"""
import html
import re
import sys

raw_path = "research_output/raw/399926.html"
result_path = "research_output/results/399926_result.md"

_FLAGS = re.DOTALL | re.IGNORECASE

# ``[^>]`` (unlike ``.``) also crosses newlines, so multi-line tags are removed.
_TAG_RE = re.compile(r'<[^>]+>')
_ARTICLE_RE = re.compile(r'<article\b[^>]*>(.*?)</article\s*>', _FLAGS)
_TABLE_RE = re.compile(r'<table\b[^>]*>(.*?)</table\s*>', _FLAGS)
# ``\b`` keeps ``<tr`` from matching longer tag names such as ``<track>``.
_ROW_RE = re.compile(r'<tr\b[^>]*>(.*?)</tr\s*>', _FLAGS)
# The back-reference makes a ``<th>`` close with ``</th>`` and ``<td>`` with ``</td>``.
_CELL_RE = re.compile(r'<(th|td)\b[^>]*>(.*?)</\1\s*>', _FLAGS)
_H1_RE = re.compile(r'<h1\b[^>]*>(.*?)</h1\s*>', _FLAGS)
# Any heading level is accepted because the original level is not certain.
_DL_HEADING_RE = re.compile(r'<h[1-6]\b[^>]*>\s*Data download', re.IGNORECASE)
_DL_LINK_RE = re.compile(
    r'<a\b[^>]*href="([^"]+)"[^>]*>\s*DOWNLOAD ALL', re.IGNORECASE
)


def clean_text(text):
    """Return *text* without tags, with entities decoded and whitespace collapsed.

    Runs of whitespace (including newlines) become a single space so the
    result can be placed on one Markdown line.
    """
    without_tags = _TAG_RE.sub('', text)
    return " ".join(html.unescape(without_tags).split())


def parse_table(table_html):
    """Parse the inner HTML of a table into a list of rows of cell strings.

    Each ``<tr>`` yields one row holding the cleaned text of its ``<th>`` /
    ``<td>`` cells, in document order. Rows without any cell are skipped.
    """
    rows = []
    for row_match in _ROW_RE.finditer(table_html):
        cells = [
            clean_text(cell_match.group(2))
            for cell_match in _CELL_RE.finditer(row_match.group(1))
        ]
        if cells:
            rows.append(cells)
    return rows


def _escape_cell(cell):
    """Make *cell* safe inside a Markdown table row (no raw ``|`` or newline)."""
    return cell.replace("\n", " ").replace("|", "\\|")


def rows_to_markdown(rows):
    """Render *rows* as a Markdown table, treating the first row as the header.

    Shorter rows are padded with empty cells up to the width of the widest
    row, so no cell is dropped by a renderer. The input lists are not
    modified. Returns ``""`` when *rows* is empty.
    """
    if not rows:
        return ""
    ncols = max(len(row) for row in rows)
    lines = []
    for index, row in enumerate(rows):
        # Build a new list: padding with ``+=`` would mutate the caller's row.
        padded = list(row) + [''] * (ncols - len(row))
        lines.append("| " + " | ".join(_escape_cell(c) for c in padded) + " |")
        if index == 0:
            # Markdown requires the delimiter row right after the header.
            lines.append("| " + " | ".join(["---"] * ncols) + " |")
    return "\n".join(lines)


def extract_article(content):
    """Return the inner HTML of the first ``<article>``, or *content* if none."""
    article_match = _ARTICLE_RE.search(content)
    return article_match.group(1) if article_match else content


def build_markdown(article_html):
    """Build the Markdown document for *article_html*.

    Sections are emitted in this order, each followed by a blank line:
    ``# <h1 text>``, one Markdown table per non-empty HTML table,
    ``## Data download`` (if such a heading exists) and the
    ``[DOWNLOAD ALL](href)`` link (if such an anchor exists).
    """
    md = []

    h1_match = _H1_RE.search(article_html)
    heading = clean_text(h1_match.group(1)) if h1_match else ""
    if heading:
        md.extend([f"# {heading}", ""])

    for table_match in _TABLE_RE.finditer(article_html):
        rows = parse_table(table_match.group(1))
        if rows:
            md.extend([rows_to_markdown(rows), ""])

    if _DL_HEADING_RE.search(article_html):
        md.extend(["## Data download", ""])

    link_match = _DL_LINK_RE.search(article_html)
    if link_match:
        # Attribute values are entity-encoded in HTML (``&amp;`` -> ``&``).
        download_link = html.unescape(link_match.group(1))
        md.extend([f"[DOWNLOAD ALL]({download_link})", ""])

    return "\n".join(md)


def main():
    """Read ``raw_path``, convert it and write the Markdown to ``result_path``."""
    with open(raw_path, 'r', encoding='utf-8') as f:
        content = f.read()

    final_md = build_markdown(extract_article(content))

    with open(result_path, 'w', encoding='utf-8') as out:
        out.write(final_md)
    print(f"Result written to {result_path}")


if __name__ == "__main__":
    main()