update run.py for fetchprob

2026-04-27 09:43:47 +09:00
parent c332fa900c
commit 695760da41
2 changed files with 579 additions and 1 deletions
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,2 @@
 click>=8.1
+latex2mathml>=3.77.0
--- a/run.py
+++ b/run.py
@@ -2,14 +2,24 @@
 import os
 import sys
 import pathlib
+import base64
 from enum import Enum, unique, auto
 from dataclasses import dataclass
 import re
 import subprocess
+import urllib.error
+import urllib.parse
+import urllib.request
+from html import escape, unescape
 import yaml

 import click

+try:
+    from latex2mathml.converter import convert as latex_to_mathml
+except ImportError:
+    latex_to_mathml = None
+
 CFG_PATH = "./config.yml"
 STATE_PATH = "./state.yml"

@@ -200,6 +210,502 @@ def parse_range_string_list(str_list) -> list[int]:
    return list(result)


+def parse_fetchprob_target(target: str) -> tuple[str, str | None]:
+    """
+    Parse a fetchprob TARGET argument into ``(location, problem_id)``.
+
+    Accepted forms:
+    - zeta/<id>: single mode, returns ("zeta", "<id>")
+    - zeta: batch mode, returns ("zeta", None)
+
+    Raises click.UsageError when the location is not "zeta" or when the id
+    is not made of ASCII digits only.
+    """
+    location, separator, raw_id = target.partition("/")
+    location = location.strip()
+
+    if location != "zeta":
+        raise click.UsageError("fetchprob target must start with 'zeta'")
+
+    if not separator:
+        return location, None
+
+    problem_id = raw_id.strip()
+    # str.isdigit() alone also accepts non-ASCII digits (superscripts,
+    # Arabic-Indic numerals, ...); those cannot be placed in the request URL.
+    if not (problem_id.isascii() and problem_id.isdigit()):
+        raise click.UsageError("problem id must be numeric (e.g. zeta/2447)")
+
+    return location, problem_id
+
+
+def extract_problem_id_from_stem(stem: str) -> str | None:
+    """
+    Extract the leading numeric BOJ id from a file stem.
+
+    Accepted forms: <id>, <id>_<suffix>, <id>-<suffix>
+    Returns the id as a string, or None when the stem has another shape.
+    """
+    # [0-9] rather than \d: \d also matches non-ASCII Unicode digits, which
+    # would yield ids that cannot be used in the request URL.
+    m = re.match(r"^([0-9]+)(?:[_-].*)?$", stem)
+    return m.group(1) if m else None
+
+
+def collect_zeta_problem_ids() -> list[str]:
+    """
+    Gather problem ids from file names under storage/zeta.
+
+    Every sub-directory of storage/zeta whose name does not start with "_"
+    is scanned, together with its optional "completed" sub-directory.
+    Returns the distinct ids sorted by numeric value; raises
+    click.ClickException when storage/zeta is not a directory.
+    """
+    zeta_root = pathlib.Path(STORAGE_DIR) / "zeta"
+    if not zeta_root.is_dir():
+        raise click.ClickException(f"Storage location '{zeta_root}' not found")
+
+    def ids_in(directory: pathlib.Path) -> set[str]:
+        # Ids taken from the regular files directly inside *directory*.
+        candidates = (
+            extract_problem_id_from_stem(entry.stem)
+            for entry in directory.iterdir()
+            if entry.is_file()
+        )
+        return {pid for pid in candidates if pid}
+
+    found: set[str] = set()
+    for lang_dir in sorted(zeta_root.iterdir()):
+        if lang_dir.name.startswith("_") or not lang_dir.is_dir():
+            continue
+
+        found |= ids_in(lang_dir)
+
+        done_dir = lang_dir / "completed"
+        if done_dir.is_dir():
+            found |= ids_in(done_dir)
+
+    return sorted(found, key=int)
+
+
+def fetch_boj_problem_html(problem_id: str, timeout: int = 10) -> str:
+    """
+    Download the raw HTML of a BOJ problem page.
+
+    problem_id is interpolated into https://www.acmicpc.net/problem/<id>;
+    timeout is the socket timeout in seconds passed to urlopen.
+
+    Returns the response body decoded as UTF-8 (undecodable bytes are
+    replaced). Every transport failure is reported as click.ClickException
+    so callers only have to handle that one exception type.
+    """
+    import http.client  # local import: only needed for the exception type
+
+    url = f"https://www.acmicpc.net/problem/{problem_id}"
+    req = urllib.request.Request(
+        url,
+        headers={
+            # Browser-like User-Agent sent with the request.
+            "User-Agent": (
+                "Mozilla/5.0 (X11; Linux x86_64) "
+                "AppleWebKit/537.36 (KHTML, like Gecko) "
+                "Chrome/123.0.0.0 Safari/537.36"
+            )
+        },
+    )
+
+    try:
+        with urllib.request.urlopen(req, timeout=timeout) as resp:
+            status = getattr(resp, "status", 200)
+            if status != 200:
+                raise click.ClickException(
+                    f"failed to fetch problem {problem_id}: HTTP {status}"
+                )
+            return resp.read().decode("utf-8", errors="replace")
+    except urllib.error.HTTPError as e:
+        raise click.ClickException(
+            f"failed to fetch problem {problem_id}: HTTP {e.code}"
+        ) from e
+    except urllib.error.URLError as e:
+        raise click.ClickException(
+            f"network error while fetching problem {problem_id}: {e.reason}"
+        ) from e
+    except (OSError, http.client.HTTPException) as e:
+        # Failures while reading the body (socket timeout, connection reset,
+        # truncated response) are not wrapped in URLError by urllib.
+        raise click.ClickException(
+            f"network error while fetching problem {problem_id}: {e}"
+        ) from e
+
+
+def _problem_static_html_path(problem_id: str) -> pathlib.Path:
+    """Return the path <STORAGE_DIR>/zeta/_static/<problem_id>.html."""
+    static_root = pathlib.Path(STORAGE_DIR, "zeta", "_static")
+    return static_root / f"{problem_id}.html"
+
+
+def _problem_static_assets_dir(problem_id: str) -> pathlib.Path:
+    """Return the path <STORAGE_DIR>/zeta/_static/assets/<problem_id>."""
+    storage_root = pathlib.Path(STORAGE_DIR)
+    return storage_root.joinpath("zeta", "_static", "assets", problem_id)
+
+
+def _guess_image_mime(src_url: str, content_type: str | None) -> str:
+    """
+    Pick the MIME type to use for an image.
+
+    The Content-Type header wins when its media type starts with "image/";
+    otherwise the type is looked up from the extension of the URL path, and
+    "image/png" is returned when the extension is not in the table.
+    """
+    header_mime = (content_type or "").partition(";")[0].strip().lower()
+    if header_mime.startswith("image/"):
+        return header_mime
+
+    url_path = urllib.parse.urlparse(src_url).path
+    suffix = pathlib.Path(url_path).suffix.lower()
+    known_suffixes = {
+        ".jpg": "image/jpeg",
+        ".jpeg": "image/jpeg",
+        ".png": "image/png",
+        ".gif": "image/gif",
+        ".webp": "image/webp",
+        ".svg": "image/svg+xml",
+        ".bmp": "image/bmp",
+        ".ico": "image/x-icon",
+    }
+    if suffix in known_suffixes:
+        return known_suffixes[suffix]
+    return "image/png"
+
+
+def _download_image_for_offline(problem_id: str, src_url: str, seq: int, force: bool) -> str | None:
+    """
+    Download one image and return it as a ``data:`` URI for inlining.
+
+    Best effort: returns None when the URL is unusable, the request fails,
+    or the response status is not 200, so the caller can keep the original
+    ``src``. ``seq`` and ``force`` are accepted for interface compatibility
+    but are not used by this function.
+    """
+    import http.client  # local import: only needed for the exception type
+
+    try:
+        # Request() itself raises ValueError for a URL without a usable
+        # scheme, so it has to sit inside the guarded region as well.
+        req = urllib.request.Request(
+            src_url,
+            headers={
+                "User-Agent": (
+                    "Mozilla/5.0 (X11; Linux x86_64) "
+                    "AppleWebKit/537.36 (KHTML, like Gecko) "
+                    "Chrome/123.0.0.0 Safari/537.36"
+                ),
+                "Referer": f"https://www.acmicpc.net/problem/{problem_id}",
+            },
+        )
+        with urllib.request.urlopen(req, timeout=10) as resp:
+            status = getattr(resp, "status", 200)
+            if status != 200:
+                return None
+            content_type = resp.headers.get("Content-Type")
+            image_bytes = resp.read()
+    except (OSError, ValueError, http.client.HTTPException):
+        # OSError covers URLError/HTTPError plus raw socket timeouts and
+        # resets during read(); ValueError covers malformed or non-ASCII
+        # URLs; HTTPException covers truncated or invalid responses.
+        return None
+
+    mime = _guess_image_mime(src_url, content_type)
+    encoded = base64.b64encode(image_bytes).decode("ascii")
+    return f"data:{mime};base64,{encoded}"
+
+
+def _localize_images_in_html(problem_id: str, html_fragment: str, force: bool) -> str:
+    """
+    Inline the images referenced by <img src=...> tags as ``data:`` URIs.
+
+    Relative URLs are resolved against the problem page URL. A tag is left
+    exactly as written when its src is empty, is already a data: URI, or
+    cannot be downloaded. Each distinct URL is requested at most once per
+    call, including URLs whose download failed.
+    """
+    base_url = f"https://www.acmicpc.net/problem/{problem_id}"
+    # Absolute URL -> data URI, or None for a download that already failed.
+    cache: dict[str, str | None] = {}
+
+    # \s before "src" keeps attributes such as data-src from matching. The
+    # three alternatives capture double-quoted, single-quoted and bare values
+    # so a quoted value containing spaces is consumed as a whole.
+    pattern = re.compile(
+        r'(<img\b[^>]*?\ssrc\s*=\s*)(?:"([^"]*)"|\'([^\']*)\'|([^\s"\'>]+))',
+        flags=re.IGNORECASE,
+    )
+
+    def repl(m: re.Match) -> str:
+        prefix, double_quoted, single_quoted, bare = m.groups()
+        quote = "'" if single_quoted is not None else '"'
+        raw_src = next(
+            value for value in (double_quoted, single_quoted, bare) if value is not None
+        ).strip()
+
+        if not raw_src or raw_src.lower().startswith("data:"):
+            return m.group(0)
+
+        # Attribute values are HTML-escaped (e.g. "&amp;" in a query string)
+        # and must be decoded before being used as a URL.
+        abs_url = urllib.parse.urljoin(base_url, unescape(raw_src))
+        if abs_url not in cache:
+            cache[abs_url] = _download_image_for_offline(
+                problem_id, abs_url, len(cache) + 1, force=force
+            )
+
+        local_src = cache[abs_url]
+        if not local_src:
+            return m.group(0)
+        return f"{prefix}{quote}{local_src}{quote}"
+
+    return pattern.sub(repl, html_fragment)
+
+
+def _extract_html_by_id(raw_html: str, tag: str, element_id: str) -> str | None:
+    """
+    Return the inner HTML of the first <tag id="element_id"> element.
+
+    Nested elements with the same tag name are balanced, so the result runs
+    up to the closing tag that really belongs to the element. The returned
+    text is stripped of surrounding whitespace. Returns None when the
+    element is not found or is never closed.
+    """
+    tag_re = re.escape(tag)
+    # \b after the tag name rejects longer tag names; \s before "id" rejects
+    # attributes that merely end in "id" (e.g. data-id).
+    open_re = re.compile(
+        rf"<{tag_re}\b[^>]*?\sid\s*=\s*([\"']){re.escape(element_id)}\1[^>]*>",
+        flags=re.IGNORECASE,
+    )
+    opening = open_re.search(raw_html)
+    if not opening:
+        return None
+
+    # Walk over every later opening/closing tag of the same name and track
+    # the nesting depth until the element's own closing tag is reached.
+    token_re = re.compile(rf"<(/?){tag_re}\b[^>]*>", flags=re.IGNORECASE)
+    depth = 1
+    for token in token_re.finditer(raw_html, opening.end()):
+        if token.group(1):
+            depth -= 1
+            if depth == 0:
+                return raw_html[opening.end():token.start()].strip()
+        elif not token.group(0).endswith("/>"):
+            depth += 1
+
+    return None
+
+
+def _strip_tags(html_text: str) -> str:
+    """Drop every <...> tag and collapse whitespace runs to single spaces."""
+    tag_pattern = re.compile(r"<[^>]+>", re.DOTALL)
+    words = tag_pattern.sub("", html_text).split()
+    return " ".join(words)
+
+
+def _render_math_expressions(html_fragment: str) -> str:
+    """
+    Replace TeX math in html_fragment with MathML for offline rendering.
+
+    - Block math $$...$$ is wrapped in <div class="math-block">.
+    - Inline math $...$ is wrapped in <span class="math-inline">.
+
+    <pre> and <code> elements are left untouched. An expression that is
+    empty or fails to convert is kept as written. The fragment is returned
+    unchanged when latex_to_mathml is None.
+    """
+    if latex_to_mathml is None:
+        return html_fragment
+
+    stash: list[str] = []
+
+    def stash_block(m: re.Match) -> str:
+        # Swap a pre/code element for a placeholder restored at the end.
+        index = len(stash)
+        stash.append(m.group(0))
+        return f"@@PROTECTED_{index}@@"
+
+    def converter(wrapper: str):
+        # Build a re.sub callback that wraps the converted MathML in *wrapper*.
+        def convert_match(m: re.Match) -> str:
+            tex = unescape(m.group(1).strip())
+            if not tex:
+                return m.group(0)
+            try:
+                mathml = latex_to_mathml(tex)
+            except Exception:
+                # Best effort: keep the original text when conversion fails.
+                return m.group(0)
+            return wrapper.format(mathml)
+
+        return convert_match
+
+    working = re.sub(
+        r"<(pre|code)\b[^>]*>.*?</\1>",
+        stash_block,
+        html_fragment,
+        flags=re.DOTALL | re.IGNORECASE,
+    )
+
+    # Block math first, so the inline pattern never sees the $$ delimiters
+    # of an expression that was converted.
+    working = re.sub(
+        r"\$\$(.+?)\$\$",
+        converter('<div class="math-block">{}</div>'),
+        working,
+        flags=re.DOTALL,
+    )
+    working = re.sub(
+        r"(?<!\$)\$(?!\$)(.+?)(?<!\$)\$(?!\$)",
+        converter('<span class="math-inline">{}</span>'),
+        working,
+        flags=re.DOTALL,
+    )
+
+    for index, original in enumerate(stash):
+        working = working.replace(f"@@PROTECTED_{index}@@", original)
+
+    return working
+
+
+def make_offline_problem_html(problem_id: str, raw_html: str, force: bool) -> str:
+    """
+    Build a self-contained offline-friendly HTML page from BOJ raw HTML.
+
+    The page contains the problem title, then every non-empty core section
+    (description, input, output, limit, hint) with images inlined and TeX
+    rendered, then the samples ordered by sample number with each input
+    directly followed by its output. When nothing can be extracted, the
+    first 100000 characters of the escaped raw HTML are embedded instead.
+
+    force is forwarded to the image localisation helper.
+    """
+    title = _extract_html_by_id(raw_html, "span", "problem_title")
+    if not title:
+        title = f"BOJ {problem_id}"
+
+    def section(heading: str, body: str) -> str:
+        # One <article> card: heading line followed by the body markup.
+        return "\n".join(
+            [
+                "<article class=\"section\">",
+                f"<h2>{heading}</h2>",
+                body,
+                "</article>",
+            ]
+        )
+
+    blocks: list[str] = []
+    core_specs = [
+        ("problem_description", "문제"),
+        ("problem_input", "입력"),
+        ("problem_output", "출력"),
+        ("problem_limit", "제한"),
+        ("problem_hint", "힌트"),
+    ]
+
+    for content_id, label in core_specs:
+        content = _extract_html_by_id(raw_html, "div", content_id)
+        if not content:
+            continue
+
+        content = _localize_images_in_html(problem_id, content, force=force)
+        blocks.append(section(label, _render_math_expressions(content)))
+
+    sample_labels = {"sampleinput": "예제 입력", "sampleoutput": "예제 출력"}
+    sample_pattern = (
+        r"<section[^>]*id=\"(sampleinput|sampleoutput)(\d+)\"[^>]*>(.*?)</section>"
+    )
+    # Sort by sample number, input before output, so that every sample input
+    # is immediately followed by its expected output.
+    sample_matches = sorted(
+        re.finditer(sample_pattern, raw_html, flags=re.DOTALL | re.IGNORECASE),
+        key=lambda m: (int(m.group(2)), m.group(1).lower() != "sampleinput"),
+    )
+
+    for m in sample_matches:
+        sample_type = m.group(1).lower()
+        idx = m.group(2)
+        section_html = m.group(3)
+
+        pre_match = re.search(
+            r"(<pre[^>]*>.*?</pre>)",
+            section_html,
+            flags=re.DOTALL | re.IGNORECASE,
+        )
+        if not pre_match:
+            continue
+
+        pre_html = _localize_images_in_html(
+            problem_id,
+            pre_match.group(1),
+            force=force,
+        )
+
+        # Prefer the heading found in the page; fall back to a generated one.
+        h2_match = re.search(
+            r"<h2[^>]*>(.*?)</h2>",
+            section_html,
+            flags=re.DOTALL | re.IGNORECASE,
+        )
+        if h2_match:
+            heading = _strip_tags(h2_match.group(1))
+        else:
+            heading = f"{sample_labels[sample_type]} {idx}"
+
+        blocks.append(section(heading, pre_html))
+
+    if not blocks:
+        # Nothing recognisable was found: embed the (truncated) escaped
+        # source so the saved file is still useful for inspection.
+        body_fallback = (
+            "<article class=\"section\">"
+            "<h2>원본 페이지</h2>"
+            "<p>문제 본문 파싱에 실패하여 원본 HTML을 포함합니다.</p>"
+            f"<pre>{escape(raw_html[:100000])}</pre>"
+            "</article>"
+        )
+        blocks.append(body_fallback)
+
+    content_html = "\n".join(blocks)
+
+    return f"""<!DOCTYPE html>
+<html lang=\"ko\">
+<head>
+    <meta charset=\"UTF-8\" />
+    <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\" />
+    <title>BOJ {problem_id} - Offline</title>
+    <style>
+        :root {{
+            --bg: #fafaf8;
+            --paper: #ffffff;
+            --ink: #1e1f24;
+            --muted: #6a6d75;
+            --line: #d8dce3;
+            --accent: #0d6e6e;
+            --code-bg: #f4f6fb;
+        }}
+        * {{ box-sizing: border-box; }}
+        body {{
+            margin: 0;
+            background:
+                radial-gradient(circle at 15% 0%, #f0efe9 0%, transparent 42%),
+                radial-gradient(circle at 85% 20%, #e7f1f2 0%, transparent 38%),
+                var(--bg);
+            color: var(--ink);
+            font-family: "Noto Sans KR", "Pretendard", "Apple SD Gothic Neo", sans-serif;
+            line-height: 1.65;
+        }}
+        main {{
+            max-width: 980px;
+            margin: 0 auto;
+            padding: 24px 16px 56px;
+        }}
+        .header {{
+            background: var(--paper);
+            border: 1px solid var(--line);
+            border-radius: 14px;
+            padding: 18px 20px;
+            margin-bottom: 18px;
+        }}
+        .header h1 {{ margin: 0 0 6px; font-size: 1.5rem; }}
+        .header p {{ margin: 0; color: var(--muted); font-size: 0.95rem; }}
+        .header a {{ color: var(--accent); text-decoration: none; }}
+        .section {{
+            background: var(--paper);
+            border: 1px solid var(--line);
+            border-radius: 14px;
+            padding: 16px 18px;
+            margin-bottom: 14px;
+            overflow-x: auto;
+        }}
+        h2 {{
+            margin: 0 0 10px;
+            font-size: 1.05rem;
+            color: var(--accent);
+            border-bottom: 1px solid var(--line);
+            padding-bottom: 8px;
+        }}
+        pre, code {{
+            font-family: "JetBrains Mono", "Fira Code", monospace;
+            background: var(--code-bg);
+        }}
+        pre {{
+            padding: 12px;
+            border-radius: 10px;
+            border: 1px solid #e7ebf2;
+            overflow: auto;
+        }}
+        blockquote {{
+            margin: 14px 0;
+            padding: 16px 16px 14px 22px;
+            border-left: 4px solid var(--accent);
+            border-radius: 10px;
+            background: linear-gradient(90deg, #eef8f8 0%, #f9fdfd 100%);
+            color: #24313a;
+            font-weight: 600;
+            position: relative;
+        }}
+        blockquote::before {{
+            content: "“";
+            position: absolute;
+            left: 8px;
+            top: 2px;
+            font-size: 1.35rem;
+            line-height: 1;
+            color: #0b5f5f;
+            opacity: 0.7;
+        }}
+        blockquote > :first-child {{ margin-top: 0; }}
+        blockquote > :last-child {{ margin-bottom: 0; }}
+        q {{
+            color: #114f50;
+            font-weight: 700;
+            background: #edf8f8;
+            border-radius: 6px;
+            padding: 0 4px;
+        }}
+        .math-inline math {{
+            font-size: 1em;
+            vertical-align: middle;
+        }}
+        .math-block {{
+            margin: 10px 0;
+            padding: 8px 10px;
+            overflow-x: auto;
+            background: #f8fbff;
+            border: 1px solid #e2ecf8;
+            border-radius: 8px;
+        }}
+        .math-block math {{
+            font-size: 1.04em;
+            display: block;
+        }}
+        table {{ border-collapse: collapse; width: 100%; }}
+        th, td {{ border: 1px solid var(--line); padding: 6px 8px; }}
+        img {{ max-width: 100%; height: auto; }}
+    </style>
+</head>
+<body>
+    <main>
+        <header class=\"header\">
+            <h1>{title}</h1>
+        </header>
+        {content_html}
+    </main>
+</body>
+</html>
+"""
+
+
+def save_problem_html(problem_id: str, html: str, force: bool) -> str:
+    """
+    Write html to storage/zeta/_static/<id>.html as UTF-8.
+
+    The _static directory is created when missing. Returns "skipped" when
+    the file already exists and force is false, otherwise "fetched".
+    """
+    pathlib.Path(STORAGE_DIR, "zeta", "_static").mkdir(parents=True, exist_ok=True)
+
+    target_file = _problem_static_html_path(problem_id)
+    may_write = force or not target_file.exists()
+    if may_write:
+        target_file.write_text(html, encoding="utf-8")
+        return "fetched"
+    return "skipped"
+
+
@click.group()
 def cli():
    pass
@@ -791,6 +1297,76 @@ def find(keyword: str, completed: bool | None):
        click.echo(f"    {status} {file_name}.{lang_name}")


+def _fetch_and_store_problem(pid: str, force: bool) -> str:
+    # Fetch, convert and save one problem; returns "skipped" or the value
+    # reported by save_problem_html.
+    if _problem_static_html_path(pid).exists() and not force:
+        return "skipped"
+
+    page = make_offline_problem_html(pid, fetch_boj_problem_html(pid), force=force)
+    return save_problem_html(pid, page, force=force)
+
+
+@click.command(name="fetchprob")
+@click.argument("target", type=str, nargs=1, required=True)
+@click.option("--force", "-f", is_flag=True, help="Overwrite existing HTML files")
+def fetchprob(target: str, force: bool):
+    """
+    Fetch BOJ problem HTML into storage/zeta/_static.
+
+    TARGET:
+      zeta/<id>  Fetch one problem
+      zeta       Fetch all detected problem ids under storage/zeta
+    """
+    location, problem_id = parse_fetchprob_target(target)
+    if location != "zeta":
+        raise click.UsageError("only 'zeta' location is supported")
+
+    # Single mode: errors propagate to click, which reports them and exits.
+    if problem_id is not None:
+        outcome = _fetch_and_store_problem(problem_id, force)
+        if outcome == "skipped":
+            click.echo(f"{problem_id}: skipped (already exists)")
+        else:
+            click.echo(f"{problem_id}: fetched (offline processed + images)")
+        return
+
+    # Batch mode: a failing problem is counted and the loop carries on.
+    ids = collect_zeta_problem_ids()
+    if not ids:
+        click.echo("No problem ids found in storage/zeta")
+        return
+
+    attempted = len(ids)
+    tally = {"fetched": 0, "skipped": 0, "failed": 0}
+
+    for pid in ids:
+        try:
+            outcome = _fetch_and_store_problem(pid, force)
+        except click.ClickException as e:
+            tally["failed"] += 1
+            click.echo(f"{pid}: failed ({e.message})")
+            continue
+
+        if outcome == "skipped":
+            tally["skipped"] += 1
+            click.echo(f"{pid}: skipped")
+        else:
+            tally["fetched"] += 1
+            click.echo(f"{pid}: fetched (offline processed + images)")
+
+    click.echo()
+    click.secho(
+        (
+            f"Summary - attempted: {attempted}, fetched: {tally['fetched']}, "
+            f"skipped: {tally['skipped']}, failed: {tally['failed']}"
+        ),
+        fg="cyan",
+        bold=True,
+    )
+
+

 cli.add_command(run)
 cli.add_command(load)
@@ -799,6 +1375,7 @@ cli.add_command(export)
 cli.add_command(state)
 cli.add_command(show)
 cli.add_command(find)
+cli.add_command(fetchprob)

 if __name__ == "__main__":
    cli()