update run.py for fetchprob

This commit is contained in:
2026-04-27 09:43:47 +09:00
parent c332fa900c
commit 695760da41
2 changed files with 579 additions and 1 deletions

View File

@@ -1 +1,2 @@
click>=8.1
latex2mathml>=3.77.0

577
run.py
View File

@@ -2,14 +2,24 @@
import os
import sys
import pathlib
import base64
from enum import Enum, unique, auto
from dataclasses import dataclass
import re
import subprocess
import urllib.error
import urllib.parse
import urllib.request
from html import escape, unescape
import yaml
import click
try:
from latex2mathml.converter import convert as latex_to_mathml
except ImportError:
latex_to_mathml = None
CFG_PATH = "./config.yml"
STATE_PATH = "./state.yml"
@@ -200,6 +210,502 @@ def parse_range_string_list(str_list) -> list[int]:
return list(result)
def parse_fetchprob_target(target: str) -> tuple[str, str | None]:
"""
fetchprob target parser.
- zeta/<id>: single mode
- zeta: batch mode
"""
parts = target.split("/", 1)
location = parts[0].strip()
if location != "zeta":
raise click.UsageError("fetchprob target must start with 'zeta'")
if len(parts) == 1:
return location, None
problem_id = parts[1].strip()
if not problem_id.isdigit():
raise click.UsageError("problem id must be numeric (e.g. zeta/2447)")
return location, problem_id
def extract_problem_id_from_stem(stem: str) -> str | None:
"""
Extract BOJ numeric id from file stem.
Accepted forms: <id>, <id>_<suffix>, <id>-<suffix>
"""
m = re.match(r"^(\d+)(?:[_-].*)?$", stem)
return m.group(1) if m else None
def collect_zeta_problem_ids() -> list[str]:
    """
    Gather every problem id found under ``<STORAGE_DIR>/zeta``.

    Each sub-directory whose name does not start with ``_`` is scanned, along
    with its ``completed`` child directory when present. Only regular files
    are considered; the id is taken from the file stem via
    extract_problem_id_from_stem().

    Returns:
        The distinct ids, sorted by numeric value.

    Raises:
        click.ClickException: if the zeta storage directory does not exist.
    """
    zeta_root = pathlib.Path(STORAGE_DIR) / "zeta"
    if not zeta_root.is_dir():
        raise click.ClickException(f"Storage location '{zeta_root}' not found")

    found: set[str] = set()
    for language_dir in sorted(zeta_root.iterdir()):
        if language_dir.name.startswith("_") or not language_dir.is_dir():
            continue
        # The language directory itself and its "completed" child are scanned
        # the same way.
        for folder in (language_dir, language_dir / "completed"):
            if not folder.is_dir():
                continue
            for entry in folder.iterdir():
                if not entry.is_file():
                    continue
                candidate = extract_problem_id_from_stem(entry.stem)
                if candidate:
                    found.add(candidate)
    return sorted(found, key=int)
def fetch_boj_problem_html(problem_id: str, timeout: int = 10) -> str:
    """
    Download the raw HTML of a BOJ problem page.

    Args:
        problem_id: numeric problem id, used to build the page URL.
        timeout: socket timeout in seconds passed to urlopen().

    Returns:
        The response body decoded as UTF-8; undecodable bytes are replaced.

    Raises:
        click.ClickException: on a non-200 status, an HTTP error, or any
            network-level failure. Every failure is reported as this one type
            so that callers catching ClickException see all of them.
    """
    # Local import: only needed for the exception type caught below.
    import http.client

    url = f"https://www.acmicpc.net/problem/{problem_id}"
    req = urllib.request.Request(
        url,
        headers={
            "User-Agent": (
                "Mozilla/5.0 (X11; Linux x86_64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/123.0.0.0 Safari/537.36"
            )
        },
    )

    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            status = getattr(resp, "status", 200)
            if status != 200:
                raise click.ClickException(
                    f"failed to fetch problem {problem_id}: HTTP {status}"
                )
            return resp.read().decode("utf-8", errors="replace")
    except urllib.error.HTTPError as e:
        raise click.ClickException(
            f"failed to fetch problem {problem_id}: HTTP {e.code}"
        ) from e
    except urllib.error.URLError as e:
        raise click.ClickException(
            f"network error while fetching problem {problem_id}: {e.reason}"
        ) from e
    except (OSError, http.client.HTTPException) as e:
        # Failures while reading the body (timeout, connection reset,
        # truncated response) are not wrapped in URLError by urllib.
        raise click.ClickException(
            f"network error while fetching problem {problem_id}: {e}"
        ) from e
def _problem_static_html_path(problem_id: str) -> pathlib.Path:
    """Return ``<STORAGE_DIR>/zeta/_static/<problem_id>.html``."""
    static_dir = pathlib.Path(STORAGE_DIR, "zeta", "_static")
    return static_dir / f"{problem_id}.html"
def _problem_static_assets_dir(problem_id: str) -> pathlib.Path:
    """Return ``<STORAGE_DIR>/zeta/_static/assets/<problem_id>``."""
    assets_root = pathlib.Path(STORAGE_DIR, "zeta", "_static", "assets")
    return assets_root / problem_id
def _guess_image_mime(src_url: str, content_type: str | None) -> str:
if content_type:
mime = content_type.split(";", 1)[0].strip().lower()
if mime.startswith("image/"):
return mime
parsed = urllib.parse.urlparse(src_url)
ext = pathlib.Path(parsed.path).suffix.lower()
ext_to_mime = {
".jpg": "image/jpeg",
".jpeg": "image/jpeg",
".png": "image/png",
".gif": "image/gif",
".webp": "image/webp",
".svg": "image/svg+xml",
".bmp": "image/bmp",
".ico": "image/x-icon",
}
return ext_to_mime.get(ext, "image/png")
def _download_image_for_offline(problem_id: str, src_url: str, seq: int, force: bool) -> str | None:
req = urllib.request.Request(
src_url,
headers={
"User-Agent": (
"Mozilla/5.0 (X11; Linux x86_64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/123.0.0.0 Safari/537.36"
),
"Referer": f"https://www.acmicpc.net/problem/{problem_id}",
},
)
try:
with urllib.request.urlopen(req, timeout=10) as resp:
status = getattr(resp, "status", 200)
if status != 200:
return None
content_type = resp.headers.get("Content-Type")
image_bytes = resp.read()
except (urllib.error.HTTPError, urllib.error.URLError):
return None
mime = _guess_image_mime(src_url, content_type)
encoded = base64.b64encode(image_bytes).decode("ascii")
return f"data:{mime};base64,{encoded}"
def _localize_images_in_html(problem_id: str, html_fragment: str, force: bool) -> str:
base_url = f"https://www.acmicpc.net/problem/{problem_id}"
counter = {"i": 0}
cache: dict[str, str] = {}
pattern = re.compile(
r'(<img\b[^>]*?\bsrc\s*=\s*)(["\']?)([^"\'>\s]+)(["\']?)',
flags=re.IGNORECASE,
)
def repl(m: re.Match) -> str:
prefix = m.group(1)
q1 = m.group(2)
src = m.group(3)
if src.startswith("data:"):
return m.group(0)
abs_url = urllib.parse.urljoin(base_url, src)
if abs_url in cache:
local_src = cache[abs_url]
quote = q1 if q1 else '"'
return f"{prefix}{quote}{local_src}{quote}"
counter["i"] += 1
local_src = _download_image_for_offline(problem_id, abs_url, counter["i"], force=force)
if not local_src:
return m.group(0)
cache[abs_url] = local_src
quote = q1 if q1 else '"'
return f"{prefix}{quote}{local_src}{quote}"
return pattern.sub(repl, html_fragment)
def _extract_html_by_id(raw_html: str, tag: str, element_id: str) -> str | None:
pattern = rf"<{tag}[^>]*id=\"{re.escape(element_id)}\"[^>]*>(.*?)</{tag}>"
m = re.search(pattern, raw_html, flags=re.DOTALL | re.IGNORECASE)
if not m:
return None
return m.group(1).strip()
def _strip_tags(html_text: str) -> str:
text = re.sub(r"<[^>]+>", "", html_text, flags=re.DOTALL)
return " ".join(text.split())
def _render_math_expressions(html_fragment: str) -> str:
    """
    Convert TeX math delimiters to MathML for offline rendering.

    - Block: ``$$...$$`` becomes ``<div class="math-block">``
    - Inline: ``$...$`` becomes ``<span class="math-inline">``

    ``<pre>`` and ``<code>`` elements are never touched. Conversion is
    best-effort: an expression the converter rejects is left as its original
    TeX source. When the converter is not installed (``latex_to_mathml`` is
    None) the fragment is returned unchanged.
    """
    if latex_to_mathml is None:
        return html_fragment

    shielded: list[str] = []

    def shield(text: str) -> str:
        # Swap text for a placeholder that later regex passes cannot alter.
        shielded.append(text)
        return f"@@PROTECTED_{len(shielded) - 1}@@"

    def convert(m: re.Match, opening: str, closing: str) -> str | None:
        # Return the wrapped MathML for one match, or None to keep it as is.
        expr = unescape(m.group(1).strip())
        if not expr:
            return None
        try:
            mathml = latex_to_mathml(expr)
        except Exception:
            # Deliberately broad: whatever the converter raises, keep the
            # original TeX rather than failing the whole page.
            return None
        return f"{opening}{mathml}{closing}"

    def repl_block(m: re.Match) -> str:
        rendered = convert(m, '<div class="math-block">', "</div>")
        if rendered is None:
            return m.group(0)
        # Shield the result so that a "$" inside the generated MathML is not
        # picked up as a delimiter by the inline pass below.
        return shield(rendered)

    def repl_inline(m: re.Match) -> str:
        rendered = convert(m, '<span class="math-inline">', "</span>")
        return m.group(0) if rendered is None else rendered

    # Do not touch code/pre blocks.
    temp = re.sub(
        r"<(pre|code)\b[^>]*>.*?</\1>",
        lambda m: shield(m.group(0)),
        html_fragment,
        flags=re.DOTALL | re.IGNORECASE,
    )
    temp = re.sub(r"\$\$(.+?)\$\$", repl_block, temp, flags=re.DOTALL)
    temp = re.sub(
        r"(?<!\$)\$(?!\$)(.+?)(?<!\$)\$(?!\$)",
        repl_inline,
        temp,
        flags=re.DOTALL,
    )

    # Restore newest first: a later entry may contain an earlier placeholder.
    for i in range(len(shielded) - 1, -1, -1):
        temp = temp.replace(f"@@PROTECTED_{i}@@", shielded[i])
    return temp
# Stylesheet embedded into every generated offline page.
_OFFLINE_PAGE_CSS = """\
:root {
  --bg: #fafaf8;
  --paper: #ffffff;
  --ink: #1e1f24;
  --muted: #6a6d75;
  --line: #d8dce3;
  --accent: #0d6e6e;
  --code-bg: #f4f6fb;
}
* { box-sizing: border-box; }
body {
  margin: 0;
  background:
    radial-gradient(circle at 15% 0%, #f0efe9 0%, transparent 42%),
    radial-gradient(circle at 85% 20%, #e7f1f2 0%, transparent 38%),
    var(--bg);
  color: var(--ink);
  font-family: "Noto Sans KR", "Pretendard", "Apple SD Gothic Neo", sans-serif;
  line-height: 1.65;
}
main {
  max-width: 980px;
  margin: 0 auto;
  padding: 24px 16px 56px;
}
.header {
  background: var(--paper);
  border: 1px solid var(--line);
  border-radius: 14px;
  padding: 18px 20px;
  margin-bottom: 18px;
}
.header h1 { margin: 0 0 6px; font-size: 1.5rem; }
.header p { margin: 0; color: var(--muted); font-size: 0.95rem; }
.header a { color: var(--accent); text-decoration: none; }
.section {
  background: var(--paper);
  border: 1px solid var(--line);
  border-radius: 14px;
  padding: 16px 18px;
  margin-bottom: 14px;
  overflow-x: auto;
}
h2 {
  margin: 0 0 10px;
  font-size: 1.05rem;
  color: var(--accent);
  border-bottom: 1px solid var(--line);
  padding-bottom: 8px;
}
pre, code {
  font-family: "JetBrains Mono", "Fira Code", monospace;
  background: var(--code-bg);
}
pre {
  padding: 12px;
  border-radius: 10px;
  border: 1px solid #e7ebf2;
  overflow: auto;
}
blockquote {
  margin: 14px 0;
  padding: 16px 16px 14px 22px;
  border-left: 4px solid var(--accent);
  border-radius: 10px;
  background: linear-gradient(90deg, #eef8f8 0%, #f9fdfd 100%);
  color: #24313a;
  font-weight: 600;
  position: relative;
}
blockquote::before {
  content: "";
  position: absolute;
  left: 8px;
  top: 2px;
  font-size: 1.35rem;
  line-height: 1;
  color: #0b5f5f;
  opacity: 0.7;
}
blockquote > :first-child { margin-top: 0; }
blockquote > :last-child { margin-bottom: 0; }
q {
  color: #114f50;
  font-weight: 700;
  background: #edf8f8;
  border-radius: 6px;
  padding: 0 4px;
}
.math-inline math {
  font-size: 1em;
  vertical-align: middle;
}
.math-block {
  margin: 10px 0;
  padding: 8px 10px;
  overflow-x: auto;
  background: #f8fbff;
  border: 1px solid #e2ecf8;
  border-radius: 8px;
}
.math-block math {
  font-size: 1.04em;
  display: block;
}
table { border-collapse: collapse; width: 100%; }
th, td { border: 1px solid var(--line); padding: 6px 8px; }
img { max-width: 100%; height: auto; }
"""


def _offline_section(heading: str, body_html: str) -> str:
    """Wrap a heading and its body in the page's ``<article class="section">`` markup."""
    return "\n".join(
        [
            '<article class="section">',
            f"<h2>{heading}</h2>",
            body_html,
            "</article>",
        ]
    )


def _offline_core_sections(problem_id: str, raw_html: str, force: bool) -> list[str]:
    """Build the statement sections (description, input, output, limit, hint)."""
    core_specs = [
        ("problem_description", "문제"),
        ("problem_input", "입력"),
        ("problem_output", "출력"),
        ("problem_limit", "제한"),
        ("problem_hint", "힌트"),
    ]
    sections: list[str] = []
    for content_id, label in core_specs:
        content = _extract_html_by_id(raw_html, "div", content_id)
        if not (content and content.strip()):
            continue
        localized = _localize_images_in_html(problem_id, content, force=force)
        localized = _render_math_expressions(localized)
        sections.append(_offline_section(label, localized))
    return sections


def _offline_sample_sections(problem_id: str, raw_html: str, force: bool) -> list[str]:
    """
    Build one section per sample input/output ``<pre>`` block.

    All sample inputs come first, then all sample outputs, each group ordered
    by its numeric index. The heading is the text of the section's own
    ``<h2>`` when present, otherwise a generated label.
    """
    sections: list[str] = []
    for sample_type, sample_label in (("sampleinput", "예제 입력"), ("sampleoutput", "예제 출력")):
        sample_pattern = rf"<section[^>]*id=\"{sample_type}(\d+)\"[^>]*>(.*?)</section>"
        sample_matches = sorted(
            re.finditer(sample_pattern, raw_html, flags=re.DOTALL | re.IGNORECASE),
            key=lambda m: int(m.group(1)),
        )
        for m in sample_matches:
            idx = m.group(1)
            section_html = m.group(2)

            pre_match = re.search(
                r"(<pre[^>]*>.*?</pre>)",
                section_html,
                flags=re.DOTALL | re.IGNORECASE,
            )
            if not pre_match:
                continue
            pre_html = _localize_images_in_html(
                problem_id,
                pre_match.group(1),
                force=force,
            )

            h2_match = re.search(
                r"<h2[^>]*>(.*?)</h2>",
                section_html,
                flags=re.DOTALL | re.IGNORECASE,
            )
            if h2_match:
                heading = _strip_tags(h2_match.group(1))
            else:
                heading = f"{sample_label} {idx}"

            sections.append(_offline_section(heading, pre_html))
    return sections


def make_offline_problem_html(problem_id: str, raw_html: str, force: bool) -> str:
    """
    Build a self-contained offline-friendly HTML page from BOJ raw HTML.

    The page holds the problem title, the statement sections, and the sample
    sections, with images inlined and TeX math rendered by the helper
    functions. If nothing can be extracted, the first 100000 characters of
    the raw HTML are embedded, escaped, as a fallback.

    Args:
        problem_id: numeric problem id; used in the page title and as the
            fallback heading.
        raw_html: HTML of the original problem page.
        force: forwarded to _localize_images_in_html().

    Returns:
        The complete HTML document as a string.
    """
    title = _extract_html_by_id(raw_html, "span", "problem_title") or f"BOJ {problem_id}"

    blocks = _offline_core_sections(problem_id, raw_html, force)
    blocks.extend(_offline_sample_sections(problem_id, raw_html, force))

    if not blocks:
        blocks.append(
            '<article class="section">'
            "<h2>원본 페이지</h2>"
            "<p>문제 본문 파싱에 실패하여 원본 HTML을 포함합니다.</p>"
            f"<pre>{escape(raw_html[:100000])}</pre>"
            "</article>"
        )

    content_html = "\n".join(blocks)

    return f"""<!DOCTYPE html>
<html lang="ko">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>BOJ {problem_id} - Offline</title>
<style>
{_OFFLINE_PAGE_CSS}</style>
</head>
<body>
<main>
<header class="header">
<h1>{title}</h1>
</header>
{content_html}
</main>
</body>
</html>
"""
def save_problem_html(problem_id: str, html: str, force: bool) -> str:
    """
    Write the page to ``<STORAGE_DIR>/zeta/_static/<problem_id>.html``.

    The ``_static`` directory is created if needed. An existing file is only
    overwritten when ``force`` is true.

    Returns:
        ``"fetched"`` when the file was written, ``"skipped"`` otherwise.
    """
    static_dir = pathlib.Path(STORAGE_DIR) / "zeta" / "_static"
    static_dir.mkdir(parents=True, exist_ok=True)

    dest = _problem_static_html_path(problem_id)
    if force or not dest.exists():
        dest.write_text(html, encoding="utf-8")
        return "fetched"
    return "skipped"
# Root click command group. It does no work itself; the subcommands are
# attached with cli.add_command(...) at the bottom of the file.
@click.group()
def cli():
    pass
@@ -791,6 +1297,76 @@ def find(keyword: str, completed: bool | None):
click.echo(f" {status} {file_name}.{lang_name}")
def _fetch_and_store_problem(problem_id: str, force: bool) -> str:
    """
    Fetch, convert and save one problem page.

    Returns ``"skipped"`` without touching the network when the offline file
    already exists and ``force`` is false; otherwise returns the result of
    save_problem_html() (``"fetched"`` or ``"skipped"``).
    """
    if _problem_static_html_path(problem_id).exists() and not force:
        return "skipped"
    raw_html = fetch_boj_problem_html(problem_id)
    offline_html = make_offline_problem_html(problem_id, raw_html, force=force)
    return save_problem_html(problem_id, offline_html, force=force)


@click.command(name="fetchprob")
@click.argument("target", type=str, nargs=1, required=True)
@click.option("--force", "-f", is_flag=True, help="Overwrite existing HTML files")
def fetchprob(target: str, force: bool):
    """
    Fetch BOJ problem HTML into storage/zeta/_static.
    TARGET:
    zeta/<id> Fetch one problem
    zeta Fetch all detected problem ids under storage/zeta
    """
    # parse_fetchprob_target() raises for any location other than "zeta", so
    # the location needs no second check here.
    _location, problem_id = parse_fetchprob_target(target)

    # Single mode: errors propagate to click, which reports them and exits.
    if problem_id is not None:
        if _fetch_and_store_problem(problem_id, force) == "skipped":
            click.echo(f"{problem_id}: skipped (already exists)")
        else:
            click.echo(f"{problem_id}: fetched (offline processed + images)")
        return

    ids = collect_zeta_problem_ids()
    if not ids:
        click.echo("No problem ids found in storage/zeta")
        return

    attempted = len(ids)
    fetched = 0
    skipped = 0
    failed = 0

    # Batch mode: a failure for one id is reported and counted, and the loop
    # continues with the remaining ids.
    for pid in ids:
        try:
            result = _fetch_and_store_problem(pid, force)
        except click.ClickException as e:
            failed += 1
            click.echo(f"{pid}: failed ({e.message})")
            continue
        except OSError as e:
            # Filesystem errors while saving must not abort the whole batch.
            failed += 1
            click.echo(f"{pid}: failed ({e})")
            continue

        if result == "skipped":
            skipped += 1
            click.echo(f"{pid}: skipped")
        else:
            fetched += 1
            click.echo(f"{pid}: fetched (offline processed + images)")

    click.echo()
    click.secho(
        (
            f"Summary - attempted: {attempted}, fetched: {fetched}, "
            f"skipped: {skipped}, failed: {failed}"
        ),
        fg="cyan",
        bold=True,
    )
# Register subcommands on the root `cli` group.
cli.add_command(run)
cli.add_command(load)
@@ -799,6 +1375,7 @@ cli.add_command(export)
cli.add_command(state)
cli.add_command(show)
cli.add_command(find)
cli.add_command(fetchprob)
# Run the command group only when the file is executed as a script.
if __name__ == "__main__":
    cli()