# ---------------------------------------------------------------------------
# Reconstruction of patch 695760da ("update run.py for fetchprob", yenru0,
# Mon 27 Apr 2026). The original patch text was mangled in transit (diff
# markers stripped, several regex/HTML spans lost); below are the patch's
# run.py additions re-emitted as clean Python. Spans that could not be
# recovered verbatim are marked NOTE(review).
#
# The patch also updates requirements.txt to:
#     click>=8.1
#     latex2mathml>=3.77.0
# NOTE(review): run.py imports `yaml` but requirements.txt does not list
# PyYAML — confirm and add it.
# ---------------------------------------------------------------------------

import base64
import pathlib
import re
import urllib.error
import urllib.parse
import urllib.request
from html import escape, unescape

import click

# latex2mathml is an optional dependency: without it, TeX math is left as-is.
try:
    from latex2mathml.converter import convert as latex_to_mathml
except ImportError:
    latex_to_mathml = None

# acmicpc.net rejects urllib's default User-Agent, so every request sends a
# browser-like one. Hoisted to a single constant (was duplicated per request).
_BROWSER_UA = (
    "Mozilla/5.0 (X11; Linux x86_64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/123.0.0.0 Safari/537.36"
)


def parse_fetchprob_target(target: str) -> tuple[str, str | None]:
    """Parse a fetchprob TARGET argument.

    Accepted forms:
      - ``zeta/<id>``  -> single-problem mode, returns ("zeta", "<id>")
      - ``zeta``       -> batch mode, returns ("zeta", None)

    Raises:
        click.UsageError: if the location is not ``zeta`` or the id is not
            purely numeric.
    """
    parts = target.split("/", 1)
    location = parts[0].strip()

    if location != "zeta":
        raise click.UsageError("fetchprob target must start with 'zeta'")

    if len(parts) == 1:
        return location, None

    problem_id = parts[1].strip()
    if not problem_id.isdigit():
        raise click.UsageError("problem id must be numeric (e.g. zeta/2447)")

    return location, problem_id


def extract_problem_id_from_stem(stem: str) -> str | None:
    """Extract a BOJ numeric id from a file stem, or None if there is none.

    Accepted forms: ``<id>``, ``<id>_<suffix>``, ``<id>-<suffix>``.
    """
    m = re.match(r"^(\d+)(?:[_-].*)?$", stem)
    return m.group(1) if m else None


def collect_zeta_problem_ids() -> list[str]:
    """Collect problem ids from storage/zeta/<lang>/* and .../completed/*.

    Language directories whose names start with "_" (e.g. ``_static``) are
    skipped. Returns the ids sorted numerically.

    Raises:
        click.ClickException: if the zeta storage directory does not exist.
    """
    zeta_dir = pathlib.Path(STORAGE_DIR) / "zeta"
    if not zeta_dir.is_dir():
        raise click.ClickException(f"Storage location '{zeta_dir}' not found")

    ids: set[str] = set()
    for lang_dir in sorted(zeta_dir.iterdir()):
        if not lang_dir.is_dir() or lang_dir.name.startswith("_"):
            continue

        for f in lang_dir.iterdir():
            if f.is_file():
                problem_id = extract_problem_id_from_stem(f.stem)
                if problem_id:
                    ids.add(problem_id)

        completed_dir = lang_dir / "completed"
        if completed_dir.is_dir():
            for f in completed_dir.iterdir():
                if f.is_file():
                    problem_id = extract_problem_id_from_stem(f.stem)
                    if problem_id:
                        ids.add(problem_id)

    return sorted(ids, key=int)


def fetch_boj_problem_html(problem_id: str, timeout: int = 10) -> str:
    """Download the raw HTML of a BOJ problem page.

    Raises:
        click.ClickException: on any HTTP error status or network failure.
    """
    url = f"https://www.acmicpc.net/problem/{problem_id}"
    req = urllib.request.Request(url, headers={"User-Agent": _BROWSER_UA})

    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            # http.client responses expose .status; default to 200 defensively.
            status = getattr(resp, "status", 200)
            if status != 200:
                raise click.ClickException(
                    f"failed to fetch problem {problem_id}: HTTP {status}"
                )
            return resp.read().decode("utf-8", errors="replace")
    except urllib.error.HTTPError as e:
        raise click.ClickException(
            f"failed to fetch problem {problem_id}: HTTP {e.code}"
        ) from e
    except urllib.error.URLError as e:
        raise click.ClickException(
            f"network error while fetching problem {problem_id}: {e.reason}"
        ) from e


def _problem_static_html_path(problem_id: str) -> pathlib.Path:
    """Destination path of the offline HTML: storage/zeta/_static/<id>.html."""
    return pathlib.Path(STORAGE_DIR) / "zeta" / "_static" / f"{problem_id}.html"


def _problem_static_assets_dir(problem_id: str) -> pathlib.Path:
    """Per-problem assets directory.

    NOTE(review): currently unused — images are embedded as data: URIs by
    ``_download_image_for_offline`` instead of being written here. Kept for
    interface compatibility; consider removing or wiring it up.
    """
    return pathlib.Path(STORAGE_DIR) / "zeta" / "_static" / "assets" / problem_id


def _guess_image_mime(src_url: str, content_type: str | None) -> str:
    """Best-effort MIME type for an image: header first, then URL extension.

    Falls back to ``image/png`` when neither source is conclusive.
    """
    if content_type:
        mime = content_type.split(";", 1)[0].strip().lower()
        if mime.startswith("image/"):
            return mime

    parsed = urllib.parse.urlparse(src_url)
    ext = pathlib.Path(parsed.path).suffix.lower()
    ext_to_mime = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
        ".svg": "image/svg+xml",
        ".bmp": "image/bmp",
        ".ico": "image/x-icon",
    }
    return ext_to_mime.get(ext, "image/png")


def _download_image_for_offline(
    problem_id: str, src_url: str, seq: int, force: bool
) -> str | None:
    """Download one image and return it as a ``data:`` URI, or None on failure.

    Failures (HTTP error, network error, non-200) are deliberately swallowed:
    the caller keeps the original remote URL so the page still degrades
    gracefully online.

    NOTE(review): ``seq`` and ``force`` are accepted but unused (they would
    matter only for the on-disk asset variant, see _problem_static_assets_dir);
    kept to preserve the call signature.
    """
    req = urllib.request.Request(
        src_url,
        headers={
            "User-Agent": _BROWSER_UA,
            # Some BOJ-hosted images require a same-site referer.
            "Referer": f"https://www.acmicpc.net/problem/{problem_id}",
        },
    )

    try:
        with urllib.request.urlopen(req, timeout=10) as resp:
            status = getattr(resp, "status", 200)
            if status != 200:
                return None
            content_type = resp.headers.get("Content-Type")
            image_bytes = resp.read()
    except (urllib.error.HTTPError, urllib.error.URLError):
        return None

    mime = _guess_image_mime(src_url, content_type)
    encoded = base64.b64encode(image_bytes).decode("ascii")
    return f"data:{mime};base64,{encoded}"


def _localize_images_in_html(problem_id: str, html_fragment: str, force: bool) -> str:
    """Rewrite every <img src=...> in the fragment to an inline data: URI.

    Relative URLs are resolved against the problem page URL. Each absolute
    URL is downloaded at most once (memoized in ``cache``); on download
    failure the original tag is left untouched.
    """
    base_url = f"https://www.acmicpc.net/problem/{problem_id}"
    counter = {"i": 0}  # sequence number passed through to the downloader
    cache: dict[str, str] = {}

    pattern = re.compile(
        r'(<img\b[^>]*?\bsrc\s*=\s*)(["\']?)([^"\'>\s]+)(["\']?)',
        flags=re.IGNORECASE,
    )

    def repl(m: re.Match) -> str:
        prefix = m.group(1)
        q1 = m.group(2)
        src = m.group(3)

        if src.startswith("data:"):
            return m.group(0)  # already inline

        abs_url = urllib.parse.urljoin(base_url, src)
        if abs_url in cache:
            local_src = cache[abs_url]
            quote = q1 if q1 else '"'
            return f"{prefix}{quote}{local_src}{quote}"

        counter["i"] += 1
        local_src = _download_image_for_offline(
            problem_id, abs_url, counter["i"], force=force
        )
        if not local_src:
            return m.group(0)  # keep remote URL on failure

        cache[abs_url] = local_src
        quote = q1 if q1 else '"'
        return f"{prefix}{quote}{local_src}{quote}"

    return pattern.sub(repl, html_fragment)


def _extract_html_by_id(raw_html: str, tag: str, element_id: str) -> str | None:
    """Return the inner HTML of the first <tag id="element_id">, or None.

    NOTE(review): the non-greedy match stops at the FIRST closing tag, so a
    section containing a nested <div> would be truncated — acceptable for
    BOJ's flat problem markup, but confirm against real pages.
    """
    pattern = rf"<{tag}[^>]*id=\"{re.escape(element_id)}\"[^>]*>(.*?)</{tag}>"
    m = re.search(pattern, raw_html, flags=re.DOTALL | re.IGNORECASE)
    if not m:
        return None
    return m.group(1).strip()


def _strip_tags(html_text: str) -> str:
    """Remove all HTML tags and collapse whitespace to single spaces."""
    text = re.sub(r"<[^>]+>", "", html_text, flags=re.DOTALL)
    return " ".join(text.split())


def _render_math_expressions(html_fragment: str) -> str:
    """Convert TeX math delimiters to MathML for offline rendering.

    - Inline math: ``$...$``
    - Block math:  ``$$...$$``

    <pre>/<code> blocks are shielded first so literal dollar signs in sample
    code are never interpreted as math. Conversion is best-effort: any
    latex2mathml failure leaves the original TeX untouched. No-op when
    latex2mathml is not installed.

    NOTE(review): the inline-math regex and the placeholder-restore loop were
    lost in the patch mangling and are reconstructed here.
    """
    if latex_to_mathml is None:
        return html_fragment

    protected_blocks: list[str] = []

    def protect(m: re.Match) -> str:
        protected_blocks.append(m.group(0))
        return f"@@PROTECTED_{len(protected_blocks) - 1}@@"

    # Do not touch code/pre blocks.
    temp = re.sub(
        r"<(pre|code)\b[^>]*>.*?</\1>",
        protect,
        html_fragment,
        flags=re.DOTALL | re.IGNORECASE,
    )

    def repl_block(m: re.Match) -> str:
        expr = unescape(m.group(1).strip())
        if not expr:
            return m.group(0)
        try:
            mathml = latex_to_mathml(expr)
            return f'<div class="math-block">{mathml}</div>'
        except Exception:
            # Best-effort: leave the raw TeX for anything we cannot convert.
            return m.group(0)

    def repl_inline(m: re.Match) -> str:
        expr = unescape(m.group(1).strip())
        if not expr:
            return m.group(0)
        try:
            mathml = latex_to_mathml(expr)
            return f'<span class="math-inline">{mathml}</span>'
        except Exception:
            return m.group(0)

    # Block math first so "$$x$$" is not consumed as two inline expressions.
    temp = re.sub(r"\$\$(.+?)\$\$", repl_block, temp, flags=re.DOTALL)
    temp = re.sub(r"(?<!\$)\$([^$\n]+?)\$(?!\$)", repl_inline, temp)

    # Restore the shielded <pre>/<code> blocks.
    for i, block in enumerate(protected_blocks):
        temp = temp.replace(f"@@PROTECTED_{i}@@", block)

    return temp


def make_offline_problem_html(problem_id: str, raw_html: str, force: bool = False) -> str:
    """Build a self-contained, offline-friendly HTML page from BOJ raw HTML.

    Extracts the core sections (description/input/output/limit/hint) and the
    numbered sample input/output blocks, inlines all images as data: URIs and
    renders TeX math to MathML. If nothing could be parsed, the raw HTML
    (truncated, escaped) is embedded as a fallback so no content is lost.
    """
    title = _extract_html_by_id(raw_html, "span", "problem_title")
    if not title:
        title = f"BOJ {problem_id}"

    blocks: list[str] = []
    # (element id on acmicpc.net, Korean section heading)
    core_specs = [
        ("problem_description", "문제"),
        ("problem_input", "입력"),
        ("problem_output", "출력"),
        ("problem_limit", "제한"),
        ("problem_hint", "힌트"),
    ]

    for content_id, fallback_label in core_specs:
        content = _extract_html_by_id(raw_html, "div", content_id)
        if not content or not content.strip():
            continue

        localized_content = _localize_images_in_html(problem_id, content, force=force)
        localized_content = _render_math_expressions(localized_content)

        blocks.append(
            "\n".join(
                [
                    "<section>",
                    f"<h2>{fallback_label}</h2>",
                    f"{localized_content}",
                    "</section>",
                ]
            )
        )

    for sample_type, sample_label in (
        ("sampleinput", "예제 입력"),
        ("sampleoutput", "예제 출력"),
    ):
        # NOTE(review): the enclosing tag name was lost in the mangled patch;
        # BOJ wraps samples in <section id="sampleinputN"> — confirm.
        sample_pattern = (
            rf"<section[^>]*id=\"{sample_type}(\d+)\"[^>]*>(.*?)</section>"
        )
        sample_matches = list(
            re.finditer(sample_pattern, raw_html, flags=re.DOTALL | re.IGNORECASE)
        )
        # Order by the numeric suffix, not document order.
        sample_matches.sort(key=lambda m: int(m.group(1)))

        for m in sample_matches:
            idx = m.group(1)
            section_html = m.group(2)
            pre_match = re.search(
                r"(<pre\b[^>]*>.*?</pre>)",
                section_html,
                flags=re.DOTALL | re.IGNORECASE,
            )
            if not pre_match:
                continue

            pre_html = _localize_images_in_html(
                problem_id,
                pre_match.group(1),
                force=force,
            )

            # Prefer the page's own heading; otherwise synthesize one.
            h2_match = re.search(
                r"<h2[^>]*>(.*?)</h2>",
                section_html,
                flags=re.DOTALL | re.IGNORECASE,
            )
            if h2_match:
                h2 = _strip_tags(h2_match.group(1))
            else:
                h2 = f"{sample_label} {idx}"

            blocks.append(
                "\n".join(
                    [
                        "<section>",
                        f"<h2>{h2}</h2>",
                        pre_html,
                        "</section>",
                    ]
                )
            )

    if not blocks:
        # Parsing failed entirely: embed the (escaped, truncated) raw page.
        body_fallback = (
            "<section>"
            "<h2>원본 페이지</h2>"
            "<p>문제 본문 파싱에 실패하여 원본 HTML을 포함합니다.</p>"
            f"<pre>{escape(raw_html[:100000])}</pre>"
            "</section>"
        )
        blocks.append(body_fallback)

    source_url = f"https://www.acmicpc.net/problem/{problem_id}"
    content_html = "\n".join(blocks)

    # NOTE(review): the original <head>/CSS was lost in the patch mangling;
    # this is a minimal functional replacement.
    return f"""<!DOCTYPE html>
<html lang="ko">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>BOJ {problem_id} - Offline</title>
<style>
body {{ font-family: sans-serif; max-width: 60rem; margin: 0 auto; padding: 1rem; }}
pre {{ background: #f5f5f5; padding: .75rem; overflow-x: auto; }}
img {{ max-width: 100%; }}
</style>
</head>
<body>
<main class="page">
<header>
<h1>{title}</h1>
<p><a href="{source_url}">{source_url}</a></p>
</header>
{content_html}
</main>
</body>
</html>
"""


def save_problem_html(problem_id: str, html: str, force: bool) -> str:
    """Save html to storage/zeta/_static/<problem_id>.html.

    Returns "fetched" when written, "skipped" when the file already exists
    and ``force`` is not set.
    """
    static_dir = pathlib.Path(STORAGE_DIR) / "zeta" / "_static"
    static_dir.mkdir(parents=True, exist_ok=True)

    dest = _problem_static_html_path(problem_id)
    if dest.exists() and not force:
        return "skipped"

    dest.write_text(html, encoding="utf-8")
    return "fetched"


@click.group()
def cli():
    pass


def _fetch_and_store_problem(problem_id: str, force: bool) -> str:
    """Fetch, offline-process and store one problem; return "fetched"/"skipped".

    Factored out of ``fetchprob`` where the single and batch paths duplicated
    this exists-check / fetch / convert / save sequence.
    """
    if _problem_static_html_path(problem_id).exists() and not force:
        return "skipped"

    raw_html = fetch_boj_problem_html(problem_id)
    offline_html = make_offline_problem_html(problem_id, raw_html, force=force)
    return save_problem_html(problem_id, offline_html, force=force)


@click.command(name="fetchprob")
@click.argument("target", type=str, nargs=1, required=True)
@click.option("--force", "-f", is_flag=True, help="Overwrite existing HTML files")
def fetchprob(target: str, force: bool):
    """
    Fetch BOJ problem HTML into storage/zeta/_static.

    TARGET:
      zeta/<problem_id>  Fetch one problem
      zeta               Fetch all detected problem ids under storage/zeta
    """
    location, problem_id = parse_fetchprob_target(target)
    if location != "zeta":
        raise click.UsageError("only 'zeta' location is supported")

    # Single-problem mode.
    if problem_id is not None:
        result = _fetch_and_store_problem(problem_id, force)
        if result == "skipped":
            click.echo(f"{problem_id}: skipped (already exists)")
        else:
            click.echo(f"{problem_id}: fetched (offline processed + images)")
        return

    # Batch mode over every id discovered in storage.
    ids = collect_zeta_problem_ids()
    if not ids:
        click.echo("No problem ids found in storage/zeta")
        return

    attempted = len(ids)
    fetched = 0
    skipped = 0
    failed = 0

    for pid in ids:
        try:
            result = _fetch_and_store_problem(pid, force)
            if result == "skipped":
                skipped += 1
                click.echo(f"{pid}: skipped")
            else:
                fetched += 1
                click.echo(f"{pid}: fetched (offline processed + images)")
        except click.ClickException as e:
            # One bad problem must not abort the whole batch.
            failed += 1
            click.echo(f"{pid}: failed ({e.message})")

    click.echo()
    click.secho(
        (
            f"Summary - attempted: {attempted}, fetched: {fetched}, "
            f"skipped: {skipped}, failed: {failed}"
        ),
        fg="cyan",
        bold=True,
    )


cli.add_command(run)
cli.add_command(load)
cli.add_command(export)
cli.add_command(state)
cli.add_command(show)
cli.add_command(find)
cli.add_command(fetchprob)

if __name__ == "__main__":
    cli()