update run.py for fetchprob
This commit is contained in:
@@ -1 +1,2 @@
|
|||||||
click>=8.1
|
click>=8.1
|
||||||
|
latex2mathml>=3.77.0
|
||||||
577
run.py
577
run.py
@@ -2,14 +2,24 @@
|
|||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import pathlib
|
import pathlib
|
||||||
|
import base64
|
||||||
from enum import Enum, unique, auto
|
from enum import Enum, unique, auto
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
import re
|
import re
|
||||||
import subprocess
|
import subprocess
|
||||||
|
import urllib.error
|
||||||
|
import urllib.parse
|
||||||
|
import urllib.request
|
||||||
|
from html import escape, unescape
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
import click
|
import click
|
||||||
|
|
||||||
|
try:
|
||||||
|
from latex2mathml.converter import convert as latex_to_mathml
|
||||||
|
except ImportError:
|
||||||
|
latex_to_mathml = None
|
||||||
|
|
||||||
CFG_PATH = "./config.yml"
|
CFG_PATH = "./config.yml"
|
||||||
STATE_PATH = "./state.yml"
|
STATE_PATH = "./state.yml"
|
||||||
|
|
||||||
@@ -200,6 +210,502 @@ def parse_range_string_list(str_list) -> list[int]:
|
|||||||
return list(result)
|
return list(result)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_fetchprob_target(target: str) -> tuple[str, str | None]:
|
||||||
|
"""
|
||||||
|
fetchprob target parser.
|
||||||
|
- zeta/<id>: single mode
|
||||||
|
- zeta: batch mode
|
||||||
|
"""
|
||||||
|
parts = target.split("/", 1)
|
||||||
|
location = parts[0].strip()
|
||||||
|
|
||||||
|
if location != "zeta":
|
||||||
|
raise click.UsageError("fetchprob target must start with 'zeta'")
|
||||||
|
|
||||||
|
if len(parts) == 1:
|
||||||
|
return location, None
|
||||||
|
|
||||||
|
problem_id = parts[1].strip()
|
||||||
|
if not problem_id.isdigit():
|
||||||
|
raise click.UsageError("problem id must be numeric (e.g. zeta/2447)")
|
||||||
|
|
||||||
|
return location, problem_id
|
||||||
|
|
||||||
|
|
||||||
|
def extract_problem_id_from_stem(stem: str) -> str | None:
|
||||||
|
"""
|
||||||
|
Extract BOJ numeric id from file stem.
|
||||||
|
Accepted forms: <id>, <id>_<suffix>, <id>-<suffix>
|
||||||
|
"""
|
||||||
|
m = re.match(r"^(\d+)(?:[_-].*)?$", stem)
|
||||||
|
return m.group(1) if m else None
|
||||||
|
|
||||||
|
|
||||||
|
def collect_zeta_problem_ids() -> list[str]:
    """Collect BOJ problem ids found under storage/zeta.

    Scans ``storage/zeta/<lang>/*`` and ``storage/zeta/<lang>/completed/*``,
    skipping underscore-prefixed directories (e.g. ``_static``) which hold
    generated assets rather than solutions.

    Returns:
        Deduplicated problem ids as strings, sorted numerically.

    Raises:
        click.ClickException: if storage/zeta does not exist.
    """

    def ids_in(directory: pathlib.Path) -> set[str]:
        # Shared scan used for both a language dir and its completed/ subdir
        # (the original duplicated this loop verbatim).
        found: set[str] = set()
        for f in directory.iterdir():
            if f.is_file():
                problem_id = extract_problem_id_from_stem(f.stem)
                if problem_id:
                    found.add(problem_id)
        return found

    zeta_dir = pathlib.Path(STORAGE_DIR) / "zeta"
    if not zeta_dir.is_dir():
        raise click.ClickException(f"Storage location '{zeta_dir}' not found")

    ids: set[str] = set()
    for lang_dir in sorted(zeta_dir.iterdir()):
        # Underscore-prefixed dirs (assets, cached HTML) are not solution dirs.
        if not lang_dir.is_dir() or lang_dir.name.startswith("_"):
            continue

        ids |= ids_in(lang_dir)

        completed_dir = lang_dir / "completed"
        if completed_dir.is_dir():
            ids |= ids_in(completed_dir)

    # Numeric sort so "100" comes before "99" never happens.
    return sorted(ids, key=int)
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_boj_problem_html(problem_id: str, timeout: int = 10) -> str:
    """Download the raw HTML of a BOJ problem page.

    Args:
        problem_id: Numeric BOJ problem id as a string.
        timeout: Socket timeout in seconds.

    Returns:
        The response body decoded as UTF-8 (undecodable bytes replaced).

    Raises:
        click.ClickException: on a non-200 response or any network error.
    """
    # A browser-like User-Agent; the bare urllib default tends to be rejected.
    browser_ua = (
        "Mozilla/5.0 (X11; Linux x86_64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/123.0.0.0 Safari/537.36"
    )
    url = f"https://www.acmicpc.net/problem/{problem_id}"
    request = urllib.request.Request(url, headers={"User-Agent": browser_ua})

    try:
        with urllib.request.urlopen(request, timeout=timeout) as resp:
            # Older http.client responses may lack .status; assume OK then.
            status = getattr(resp, "status", 200)
            if status != 200:
                raise click.ClickException(
                    f"failed to fetch problem {problem_id}: HTTP {status}"
                )
            return resp.read().decode("utf-8", errors="replace")
    except urllib.error.HTTPError as e:
        raise click.ClickException(
            f"failed to fetch problem {problem_id}: HTTP {e.code}"
        ) from e
    except urllib.error.URLError as e:
        raise click.ClickException(
            f"network error while fetching problem {problem_id}: {e.reason}"
        ) from e
|
||||||
|
|
||||||
|
|
||||||
|
def _problem_static_html_path(problem_id: str) -> pathlib.Path:
    """Path of the saved offline HTML page for *problem_id*."""
    static_dir = pathlib.Path(STORAGE_DIR) / "zeta" / "_static"
    return static_dir / f"{problem_id}.html"
|
||||||
|
|
||||||
|
|
||||||
|
def _problem_static_assets_dir(problem_id: str) -> pathlib.Path:
    """Per-problem directory for downloaded image assets.

    NOTE(review): no visible caller in this change — images are inlined as
    data: URIs instead; confirm whether this helper is still needed.
    """
    assets_root = pathlib.Path(STORAGE_DIR) / "zeta" / "_static" / "assets"
    return assets_root / problem_id
|
||||||
|
|
||||||
|
|
||||||
|
def _guess_image_mime(src_url: str, content_type: str | None) -> str:
|
||||||
|
if content_type:
|
||||||
|
mime = content_type.split(";", 1)[0].strip().lower()
|
||||||
|
if mime.startswith("image/"):
|
||||||
|
return mime
|
||||||
|
|
||||||
|
parsed = urllib.parse.urlparse(src_url)
|
||||||
|
ext = pathlib.Path(parsed.path).suffix.lower()
|
||||||
|
ext_to_mime = {
|
||||||
|
".jpg": "image/jpeg",
|
||||||
|
".jpeg": "image/jpeg",
|
||||||
|
".png": "image/png",
|
||||||
|
".gif": "image/gif",
|
||||||
|
".webp": "image/webp",
|
||||||
|
".svg": "image/svg+xml",
|
||||||
|
".bmp": "image/bmp",
|
||||||
|
".ico": "image/x-icon",
|
||||||
|
}
|
||||||
|
return ext_to_mime.get(ext, "image/png")
|
||||||
|
|
||||||
|
|
||||||
|
def _download_image_for_offline(problem_id: str, src_url: str, seq: int, force: bool) -> str | None:
    """Download one image and return it inlined as a base64 ``data:`` URI.

    Returns None on any HTTP/network failure so the caller can leave the
    original ``<img src>`` untouched (localization is best-effort).

    NOTE(review): ``seq`` and ``force`` are not used in this body — they look
    intended for writing numbered files into the assets dir; confirm whether
    the data-URI approach supersedes that plan.
    """
    req = urllib.request.Request(
        src_url,
        headers={
            # Browser-like UA plus a problem-page Referer; image hosts used
            # by BOJ may check either.
            "User-Agent": (
                "Mozilla/5.0 (X11; Linux x86_64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/123.0.0.0 Safari/537.36"
            ),
            "Referer": f"https://www.acmicpc.net/problem/{problem_id}",
        },
    )

    try:
        with urllib.request.urlopen(req, timeout=10) as resp:
            status = getattr(resp, "status", 200)
            if status != 200:
                return None
            content_type = resp.headers.get("Content-Type")
            image_bytes = resp.read()
    except (urllib.error.HTTPError, urllib.error.URLError):
        # Best-effort: a missing image must not fail the whole page fetch.
        return None

    mime = _guess_image_mime(src_url, content_type)
    encoded = base64.b64encode(image_bytes).decode("ascii")
    return f"data:{mime};base64,{encoded}"
|
||||||
|
|
||||||
|
|
||||||
|
def _localize_images_in_html(problem_id: str, html_fragment: str, force: bool) -> str:
    """Rewrite every ``<img src=...>`` in the fragment to an inlined data: URI.

    Relative src values are resolved against the problem page URL.
    Already-inlined ``data:`` URIs are skipped, failed downloads leave the
    original reference untouched, and downloads are cached per absolute URL
    within this call so repeated images are fetched only once.
    """
    base_url = f"https://www.acmicpc.net/problem/{problem_id}"
    # Dict-wrapped counter so the nested repl() can mutate it (pre-nonlocal style).
    counter = {"i": 0}
    # absolute URL -> data URI; avoids re-downloading repeated images.
    cache: dict[str, str] = {}

    # Groups: (1) "<img ... src=" prefix, (2) opening quote (may be empty),
    # (3) the src value, (4) closing quote (may be empty; consumed and
    # re-emitted by repl).
    pattern = re.compile(
        r'(<img\b[^>]*?\bsrc\s*=\s*)(["\']?)([^"\'>\s]+)(["\']?)',
        flags=re.IGNORECASE,
    )

    def repl(m: re.Match) -> str:
        prefix = m.group(1)
        q1 = m.group(2)
        src = m.group(3)

        # Already self-contained; nothing to do.
        if src.startswith("data:"):
            return m.group(0)

        abs_url = urllib.parse.urljoin(base_url, src)
        if abs_url in cache:
            local_src = cache[abs_url]
            # Re-quote with the original quote char, defaulting to double quotes.
            quote = q1 if q1 else '"'
            return f"{prefix}{quote}{local_src}{quote}"

        counter["i"] += 1
        local_src = _download_image_for_offline(problem_id, abs_url, counter["i"], force=force)
        if not local_src:
            # Download failed: keep the original remote reference.
            return m.group(0)

        cache[abs_url] = local_src
        quote = q1 if q1 else '"'
        return f"{prefix}{quote}{local_src}{quote}"

    return pattern.sub(repl, html_fragment)
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_html_by_id(raw_html: str, tag: str, element_id: str) -> str | None:
|
||||||
|
pattern = rf"<{tag}[^>]*id=\"{re.escape(element_id)}\"[^>]*>(.*?)</{tag}>"
|
||||||
|
m = re.search(pattern, raw_html, flags=re.DOTALL | re.IGNORECASE)
|
||||||
|
if not m:
|
||||||
|
return None
|
||||||
|
return m.group(1).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def _strip_tags(html_text: str) -> str:
|
||||||
|
text = re.sub(r"<[^>]+>", "", html_text, flags=re.DOTALL)
|
||||||
|
return " ".join(text.split())
|
||||||
|
|
||||||
|
|
||||||
|
def _render_math_expressions(html_fragment: str) -> str:
    """Convert TeX math delimiters to MathML for offline rendering.

    - Block math:  ``$$...$$``  -> ``<div class="math-block">``
    - Inline math: ``$...$``    -> ``<span class="math-inline">``

    ``<pre>``/``<code>`` blocks are protected from substitution. Expressions
    latex2mathml cannot convert are left verbatim. No-op when latex2mathml
    is not installed.
    """
    if latex_to_mathml is None:
        return html_fragment

    # Verbatim spans stashed here and restored after math substitution.
    protected_blocks: list[str] = []

    def protect(m: re.Match) -> str:
        protected_blocks.append(m.group(0))
        return f"@@PROTECTED_{len(protected_blocks) - 1}@@"

    # Do not touch code/pre blocks.
    temp = re.sub(
        r"<(pre|code)\b[^>]*>.*?</\1>",
        protect,
        html_fragment,
        flags=re.DOTALL | re.IGNORECASE,
    )

    def repl_block(m: re.Match) -> str:
        # Unescape first: the fragment is HTML, so TeX like "a&lt;b" needs "a<b".
        expr = unescape(m.group(1).strip())
        if not expr:
            return m.group(0)
        try:
            mathml = latex_to_mathml(expr)
            return f'<div class="math-block">{mathml}</div>'
        except Exception:
            # Conversion failure: keep the original TeX visible.
            return m.group(0)

    def repl_inline(m: re.Match) -> str:
        expr = unescape(m.group(1).strip())
        if not expr:
            return m.group(0)
        try:
            mathml = latex_to_mathml(expr)
            return f'<span class="math-inline">{mathml}</span>'
        except Exception:
            return m.group(0)

    # Block pass first so the inline pass never sees $$ delimiters;
    # the lookarounds keep single-$ matching away from any remaining $$.
    temp = re.sub(r"\$\$(.+?)\$\$", repl_block, temp, flags=re.DOTALL)
    temp = re.sub(r"(?<!\$)\$(?!\$)(.+?)(?<!\$)\$(?!\$)", repl_inline, temp, flags=re.DOTALL)

    # Restore the protected pre/code spans in insertion order.
    for i, block in enumerate(protected_blocks):
        temp = temp.replace(f"@@PROTECTED_{i}@@", block)

    return temp
|
||||||
|
|
||||||
|
|
||||||
|
def make_offline_problem_html(problem_id: str, raw_html: str, force: bool) -> str:
    """Build a self-contained, offline-friendly HTML page from BOJ raw HTML.

    Extracts the core sections (description, input, output, limit, hint)
    and all numbered sample input/output blocks, inlines their images and
    renders TeX math, then wraps everything in a standalone styled page.
    Falls back to embedding the (escaped, truncated) raw HTML when no
    section could be parsed.
    """
    title = _extract_html_by_id(raw_html, "span", "problem_title")
    if not title:
        # Parsing failed; use a generic title rather than aborting.
        title = f"BOJ {problem_id}"

    blocks: list[str] = []
    # (element id on the BOJ page, Korean section heading used offline)
    core_specs = [
        ("problem_description", "문제"),
        ("problem_input", "입력"),
        ("problem_output", "출력"),
        ("problem_limit", "제한"),
        ("problem_hint", "힌트"),
    ]

    for content_id, fallback_label in core_specs:
        content = _extract_html_by_id(raw_html, "div", content_id)
        # Sections absent on a given problem (e.g. hint) are simply skipped.
        if not content or not content.strip():
            continue

        localized_content = _localize_images_in_html(
            problem_id,
            content,
            force=force,
        )
        localized_content = _render_math_expressions(localized_content)

        blocks.append(
            "\n".join(
                [
                    "<article class=\"section\">",
                    f"<h2>{fallback_label}</h2>",
                    f"{localized_content}",
                    "</article>",
                ]
            )
        )

    # Sample sections are numbered sampleinput1, sampleoutput1, ...
    for sample_type, sample_label in (("sampleinput", "예제 입력"), ("sampleoutput", "예제 출력")):
        sample_pattern = rf"<section[^>]*id=\"{sample_type}(\d+)\"[^>]*>(.*?)</section>"
        sample_matches = list(
            re.finditer(sample_pattern, raw_html, flags=re.DOTALL | re.IGNORECASE)
        )
        # Numeric sort so sample 10 follows sample 9, not sample 1.
        sample_matches.sort(key=lambda m: int(m.group(1)))

        for m in sample_matches:
            idx = m.group(1)
            section_html = m.group(2)
            # The actual sample data lives in the first <pre> of the section.
            pre_match = re.search(
                r"(<pre[^>]*>.*?</pre>)",
                section_html,
                flags=re.DOTALL | re.IGNORECASE,
            )
            if not pre_match:
                continue

            pre_html = _localize_images_in_html(
                problem_id,
                pre_match.group(1),
                force=force,
            )

            # Prefer the page's own heading; fall back to a synthesized one.
            h2_match = re.search(
                r"<h2[^>]*>(.*?)</h2>",
                section_html,
                flags=re.DOTALL | re.IGNORECASE,
            )
            if h2_match:
                h2 = _strip_tags(h2_match.group(1))
            else:
                h2 = f"{sample_label} {idx}"

            blocks.append(
                "\n".join(
                    [
                        "<article class=\"section\">",
                        f"<h2>{h2}</h2>",
                        pre_html,
                        "</article>",
                    ]
                )
            )

    if not blocks:
        # Nothing parsed: embed the escaped raw page (truncated) so the
        # user still has something to read offline.
        body_fallback = (
            "<article class=\"section\">"
            "<h2>원본 페이지</h2>"
            "<p>문제 본문 파싱에 실패하여 원본 HTML을 포함합니다.</p>"
            f"<pre>{escape(raw_html[:100000])}</pre>"
            "</article>"
        )
        blocks.append(body_fallback)

    # NOTE(review): source_url is computed but never interpolated into the
    # template below — confirm whether a "view original" link was intended.
    source_url = f"https://www.acmicpc.net/problem/{problem_id}"
    content_html = "\n".join(blocks)

    # Doubled braces ({{ }}) are literal CSS braces inside this f-string.
    return f"""<!DOCTYPE html>
<html lang=\"ko\">
<head>
<meta charset=\"UTF-8\" />
<meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\" />
<title>BOJ {problem_id} - Offline</title>
<style>
:root {{
--bg: #fafaf8;
--paper: #ffffff;
--ink: #1e1f24;
--muted: #6a6d75;
--line: #d8dce3;
--accent: #0d6e6e;
--code-bg: #f4f6fb;
}}
* {{ box-sizing: border-box; }}
body {{
margin: 0;
background:
radial-gradient(circle at 15% 0%, #f0efe9 0%, transparent 42%),
radial-gradient(circle at 85% 20%, #e7f1f2 0%, transparent 38%),
var(--bg);
color: var(--ink);
font-family: "Noto Sans KR", "Pretendard", "Apple SD Gothic Neo", sans-serif;
line-height: 1.65;
}}
main {{
max-width: 980px;
margin: 0 auto;
padding: 24px 16px 56px;
}}
.header {{
background: var(--paper);
border: 1px solid var(--line);
border-radius: 14px;
padding: 18px 20px;
margin-bottom: 18px;
}}
.header h1 {{ margin: 0 0 6px; font-size: 1.5rem; }}
.header p {{ margin: 0; color: var(--muted); font-size: 0.95rem; }}
.header a {{ color: var(--accent); text-decoration: none; }}
.section {{
background: var(--paper);
border: 1px solid var(--line);
border-radius: 14px;
padding: 16px 18px;
margin-bottom: 14px;
overflow-x: auto;
}}
h2 {{
margin: 0 0 10px;
font-size: 1.05rem;
color: var(--accent);
border-bottom: 1px solid var(--line);
padding-bottom: 8px;
}}
pre, code {{
font-family: "JetBrains Mono", "Fira Code", monospace;
background: var(--code-bg);
}}
pre {{
padding: 12px;
border-radius: 10px;
border: 1px solid #e7ebf2;
overflow: auto;
}}
blockquote {{
margin: 14px 0;
padding: 16px 16px 14px 22px;
border-left: 4px solid var(--accent);
border-radius: 10px;
background: linear-gradient(90deg, #eef8f8 0%, #f9fdfd 100%);
color: #24313a;
font-weight: 600;
position: relative;
}}
blockquote::before {{
content: "“";
position: absolute;
left: 8px;
top: 2px;
font-size: 1.35rem;
line-height: 1;
color: #0b5f5f;
opacity: 0.7;
}}
blockquote > :first-child {{ margin-top: 0; }}
blockquote > :last-child {{ margin-bottom: 0; }}
q {{
color: #114f50;
font-weight: 700;
background: #edf8f8;
border-radius: 6px;
padding: 0 4px;
}}
.math-inline math {{
font-size: 1em;
vertical-align: middle;
}}
.math-block {{
margin: 10px 0;
padding: 8px 10px;
overflow-x: auto;
background: #f8fbff;
border: 1px solid #e2ecf8;
border-radius: 8px;
}}
.math-block math {{
font-size: 1.04em;
display: block;
}}
table {{ border-collapse: collapse; width: 100%; }}
th, td {{ border: 1px solid var(--line); padding: 6px 8px; }}
img {{ max-width: 100%; height: auto; }}
</style>
</head>
<body>
<main>
<header class=\"header\">
<h1>{title}</h1>
</header>
{content_html}
</main>
</body>
</html>
"""
|
||||||
|
|
||||||
|
|
||||||
|
def save_problem_html(problem_id: str, html: str, force: bool) -> str:
    """Write *html* to storage/zeta/_static/<id>.html.

    Returns:
        "fetched" when the file was written, "skipped" when it already
        existed and *force* was not set.
    """
    destination = _problem_static_html_path(problem_id)
    # Make sure the _static directory exists before any existence check.
    destination.parent.mkdir(parents=True, exist_ok=True)

    if destination.exists() and not force:
        return "skipped"

    destination.write_text(html, encoding="utf-8")
    return "fetched"
|
||||||
|
|
||||||
|
|
||||||
# Top-level command group; subcommands are attached via cli.add_command below.
# (No docstring on purpose: it would surface in `--help` output.)
@click.group()
def cli():
    pass
|
||||||
@@ -791,6 +1297,76 @@ def find(keyword: str, completed: bool | None):
|
|||||||
click.echo(f" {status} {file_name}.{lang_name}")
|
click.echo(f" {status} {file_name}.{lang_name}")
|
||||||
|
|
||||||
|
|
||||||
|
def _fetch_and_store_problem(problem_id: str, force: bool) -> str:
    """Fetch one problem, build its offline HTML, and save it.

    Returns the save_problem_html result tag: "fetched" or "skipped".
    (Factored out: the original duplicated this sequence in both the
    single and batch branches of fetchprob.)
    """
    raw_html = fetch_boj_problem_html(problem_id)
    offline_html = make_offline_problem_html(problem_id, raw_html, force=force)
    return save_problem_html(problem_id, offline_html, force=force)


@click.command(name="fetchprob")
@click.argument("target", type=str, nargs=1, required=True)
@click.option("--force", "-f", is_flag=True, help="Overwrite existing HTML files")
def fetchprob(target: str, force: bool):
    """
    Fetch BOJ problem HTML into storage/zeta/_static.

    TARGET:
        zeta/<id>   Fetch one problem
        zeta        Fetch all detected problem ids under storage/zeta
    """
    location, problem_id = parse_fetchprob_target(target)
    # Defensive: parse_fetchprob_target already enforces this today.
    if location != "zeta":
        raise click.UsageError("only 'zeta' location is supported")

    # --- single mode ------------------------------------------------------
    if problem_id is not None:
        # Cheap pre-check avoids the network round-trip entirely.
        if _problem_static_html_path(problem_id).exists() and not force:
            click.echo(f"{problem_id}: skipped (already exists)")
            return

        result = _fetch_and_store_problem(problem_id, force)
        if result == "skipped":
            click.echo(f"{problem_id}: skipped (already exists)")
        else:
            click.echo(f"{problem_id}: fetched (offline processed + images)")
        return

    # --- batch mode -------------------------------------------------------
    ids = collect_zeta_problem_ids()
    if not ids:
        click.echo("No problem ids found in storage/zeta")
        return

    attempted = len(ids)
    fetched = 0
    skipped = 0
    failed = 0

    for pid in ids:
        try:
            if _problem_static_html_path(pid).exists() and not force:
                skipped += 1
                click.echo(f"{pid}: skipped")
                continue

            result = _fetch_and_store_problem(pid, force)
            if result == "skipped":
                skipped += 1
                click.echo(f"{pid}: skipped")
            else:
                fetched += 1
                click.echo(f"{pid}: fetched (offline processed + images)")
        except click.ClickException as e:
            # One bad problem must not abort the whole batch.
            failed += 1
            click.echo(f"{pid}: failed ({e.message})")

    click.echo()
    click.secho(
        (
            f"Summary - attempted: {attempted}, fetched: {fetched}, "
            f"skipped: {skipped}, failed: {failed}"
        ),
        fg="cyan",
        bold=True,
    )
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Register subcommands on the top-level click group.
cli.add_command(run)
cli.add_command(load)
||||||
@@ -799,6 +1375,7 @@ cli.add_command(export)
|
|||||||
cli.add_command(state)
cli.add_command(show)
cli.add_command(find)
# New in this change: BOJ offline problem fetcher.
cli.add_command(fetchprob)

# Script entry point.
if __name__ == "__main__":
    cli()
|
||||||
|
|||||||
Reference in New Issue
Block a user