import re
import html
import sys

# Input HTML snapshot and the Markdown report that is generated from it.
raw_path = "research_output/raw/399926.html"
result_path = "research_output/results/399926_result.md"

with open(raw_path, mode='r', encoding='utf-8') as raw_file:
    content = raw_file.read()

# Restrict parsing to the body of the first <article> element; when the page
# has no such element, the whole document is used instead.
article_match = re.search(r'<article.*?>(.*?)</article>', content, re.DOTALL | re.IGNORECASE)
article_html = article_match.group(1) if article_match else content

def clean_text(text):
    """Return ``text`` with HTML tags removed and character references decoded.

    Args:
        text: HTML fragment as a string.

    Returns:
        The plain text, with leading and trailing whitespace stripped.
    """
    # ``[^>]*`` also matches newlines (``.*?`` without re.DOTALL does not), so
    # tags whose attributes wrap onto another line are removed as well.
    without_tags = re.sub(r'<[^>]*>', '', text)
    return html.unescape(without_tags).strip()

def parse_table(table_html):
    """Parse the rows of one HTML table into lists of plain-text cells.

    Args:
        table_html: Markup of a table (the text between ``<table>`` and
            ``</table>``).

    Returns:
        A list with one entry per ``<tr>`` that contains at least one
        ``<th>``/``<td>`` cell, in document order. Each entry is a list of
        cell strings with inner tags removed, character references decoded
        and surrounding whitespace stripped. Rows without cells are skipped.
    """
    flags = re.DOTALL | re.IGNORECASE
    rows = []
    # ``\b`` keeps the tag-name match exact, so ``<track>`` is not taken for a
    # row and ``<thead>`` is not taken for a header cell.
    for tr_match in re.finditer(r'<tr\b[^>]*>(.*?)</tr>', table_html, flags):
        cells = []
        for cell_match in re.finditer(r'<(th|td)\b[^>]*>(.*?)</\1>', tr_match.group(1), flags):
            # ``[^>]*`` spans newlines, so inner tags that wrap across lines
            # are stripped too instead of leaking into the cell text.
            cell_text = re.sub(r'<[^>]*>', '', cell_match.group(2))
            cells.append(html.unescape(cell_text).strip())
        if cells:
            rows.append(cells)
    return rows

def rows_to_markdown(rows):
    """Render table rows as a Markdown pipe table.

    The first row is treated as the header and is followed by a ``---``
    separator row; every later row becomes a body row.

    Args:
        rows: List of rows, each a list of cell strings. The argument is not
            modified.

    Returns:
        The table as newline-joined text, or ``""`` when ``rows`` is empty.
    """
    if not rows:
        return ""

    # Size the table to its widest row: shorter rows (including the header)
    # are padded with empty cells, so no row carries more cells than the
    # header and nothing is dropped when the table is rendered.
    ncols = max(len(row) for row in rows)

    def render(cells):
        return "| " + " | ".join(cells) + " |"

    def normalise(row):
        # A cell must stay on one line and must not contain a bare '|',
        # otherwise it would be read as a row or column break.
        cells = [" ".join(cell.split()).replace("|", "\\|") for cell in row]
        # Build a new list rather than extending ``row`` in place, so the
        # caller's data is left untouched.
        return cells + [""] * (ncols - len(cells))

    header, *body = (normalise(row) for row in rows)
    lines = [render(header), render(["---"] * ncols)]
    lines.extend(render(cells) for cells in body)
    return "\n".join(lines)

# Parse every <table> of the article in document order, keeping only those
# that yielded at least one row.
table_bodies = (
    found.group(1)
    for found in re.finditer(r'<table.*?>(.*?)</table>', article_html, re.DOTALL | re.IGNORECASE)
)
tables = [parsed for parsed in map(parse_table, table_bodies) if parsed]

# Page title: cleaned text of the first <h1>, or "" when the article has none.
h1_match = re.search(r'<h1[^>]*>(.*?)</h1>', article_html, re.DOTALL | re.IGNORECASE)
heading = clean_text(h1_match.group(1)) if h1_match else ""

# The download section is reported only when an <h5> whose content is exactly
# "Data download" (case-insensitive) is present.
dl_h5_match = re.search(r'<h5[^>]*>Data download</h5>', article_html, re.IGNORECASE)
data_dl_heading = "Data download" if dl_h5_match else ""

# Target of the first <a href="...">DOWNLOAD ALL</a> anchor anywhere in the
# article (the search is not limited to the download section).
link_match = re.search(r'<a[^>]*href="([^"]+)"[^>]*>DOWNLOAD ALL</a>', article_html, re.IGNORECASE)
# The captured value is raw attribute text, so character references such as
# "&amp;" are decoded to obtain the real URL.
download_link = html.unescape(link_match.group(1)) if link_match else ""

# Assemble the report: title, then each table, then the optional download
# section. Every element is followed by an empty entry so that the joined text
# has a blank line between sections.
md = []
if heading:
    md += [f"# {heading}", ""]

for table_rows in tables:
    md += [rows_to_markdown(table_rows), ""]

if data_dl_heading:
    md += [f"## {data_dl_heading}", ""]
    if download_link:
        md += [f"[DOWNLOAD ALL]({download_link})", ""]

final_md = "\n".join(md)

with open(result_path, mode='w', encoding='utf-8') as result_file:
    result_file.write(final_md)

print(f"Result written to {result_path}")
