Update run.py: add the fetchprob command
This commit is contained in:
@@ -1 +1,2 @@
|
||||
click>=8.1
|
||||
click>=8.1
|
||||
latex2mathml>=3.77.0
|
||||
577
run.py
577
run.py
@@ -2,14 +2,24 @@
|
||||
import os
|
||||
import sys
|
||||
import pathlib
|
||||
import base64
|
||||
from enum import Enum, unique, auto
|
||||
from dataclasses import dataclass
|
||||
import re
|
||||
import subprocess
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from html import escape, unescape
|
||||
import yaml
|
||||
|
||||
import click
|
||||
|
||||
try:
|
||||
from latex2mathml.converter import convert as latex_to_mathml
|
||||
except ImportError:
|
||||
latex_to_mathml = None
|
||||
|
||||
CFG_PATH = "./config.yml"
|
||||
STATE_PATH = "./state.yml"
|
||||
|
||||
@@ -200,6 +210,502 @@ def parse_range_string_list(str_list) -> list[int]:
|
||||
return list(result)
|
||||
|
||||
|
||||
def parse_fetchprob_target(target: str) -> tuple[str, str | None]:
|
||||
"""
|
||||
fetchprob target parser.
|
||||
- zeta/<id>: single mode
|
||||
- zeta: batch mode
|
||||
"""
|
||||
parts = target.split("/", 1)
|
||||
location = parts[0].strip()
|
||||
|
||||
if location != "zeta":
|
||||
raise click.UsageError("fetchprob target must start with 'zeta'")
|
||||
|
||||
if len(parts) == 1:
|
||||
return location, None
|
||||
|
||||
problem_id = parts[1].strip()
|
||||
if not problem_id.isdigit():
|
||||
raise click.UsageError("problem id must be numeric (e.g. zeta/2447)")
|
||||
|
||||
return location, problem_id
|
||||
|
||||
|
||||
def extract_problem_id_from_stem(stem: str) -> str | None:
|
||||
"""
|
||||
Extract BOJ numeric id from file stem.
|
||||
Accepted forms: <id>, <id>_<suffix>, <id>-<suffix>
|
||||
"""
|
||||
m = re.match(r"^(\d+)(?:[_-].*)?$", stem)
|
||||
return m.group(1) if m else None
|
||||
|
||||
|
||||
def collect_zeta_problem_ids() -> list[str]:
    """
    Gather the distinct problem ids found under storage/zeta.

    Every sub-directory of storage/zeta whose name does not start with "_"
    is scanned, together with its optional "completed" sub-directory. Only
    regular files are considered; the id is taken from each file stem.

    Returns the ids sorted by numeric value.

    Raises:
        click.ClickException: if the storage/zeta directory does not exist.
    """
    root = pathlib.Path(STORAGE_DIR) / "zeta"
    if not root.is_dir():
        raise click.ClickException(f"Storage location '{root}' not found")

    found: set[str] = set()
    for language_dir in sorted(root.iterdir()):
        if language_dir.name.startswith("_") or not language_dir.is_dir():
            continue

        # The language directory itself and its "completed" child are
        # scanned in exactly the same way.
        for folder in (language_dir, language_dir / "completed"):
            if not folder.is_dir():
                continue
            for entry in folder.iterdir():
                if not entry.is_file():
                    continue
                pid = extract_problem_id_from_stem(entry.stem)
                if pid:
                    found.add(pid)

    return sorted(found, key=int)
|
||||
|
||||
|
||||
def fetch_boj_problem_html(problem_id: str, timeout: int = 10) -> str:
    """
    Download the raw HTML of a BOJ problem page.

    Args:
        problem_id: problem id used to build the page URL.
        timeout: timeout in seconds passed to ``urlopen``.

    Returns:
        The response body decoded as UTF-8; undecodable bytes are replaced.

    Raises:
        click.ClickException: on a non-200 status, an HTTP error, or any
            other network failure, including one that happens while the
            response body is being read.
    """
    # Local import: only needed for the exception type handled below.
    import http.client

    url = f"https://www.acmicpc.net/problem/{problem_id}"
    req = urllib.request.Request(
        url,
        headers={
            "User-Agent": (
                "Mozilla/5.0 (X11; Linux x86_64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/123.0.0.0 Safari/537.36"
            )
        },
    )

    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            status = getattr(resp, "status", 200)
            if status != 200:
                raise click.ClickException(
                    f"failed to fetch problem {problem_id}: HTTP {status}"
                )
            return resp.read().decode("utf-8", errors="replace")
    except urllib.error.HTTPError as e:
        raise click.ClickException(
            f"failed to fetch problem {problem_id}: HTTP {e.code}"
        ) from e
    except urllib.error.URLError as e:
        raise click.ClickException(
            f"network error while fetching problem {problem_id}: {e.reason}"
        ) from e
    except (OSError, http.client.HTTPException) as e:
        # A timeout or connection reset during resp.read() is raised as a
        # plain OSError (and a truncated response as an http.client error),
        # not as URLError. Convert those too, so callers that only handle
        # ClickException are not aborted by a flaky connection.
        raise click.ClickException(
            f"network error while fetching problem {problem_id}: {e}"
        ) from e
|
||||
|
||||
|
||||
def _problem_static_html_path(problem_id: str) -> pathlib.Path:
    """
    Return the path of the saved offline page for a problem:
    <STORAGE_DIR>/zeta/_static/<problem_id>.html
    """
    static_root = pathlib.Path(STORAGE_DIR, "zeta", "_static")
    return static_root / f"{problem_id}.html"
|
||||
|
||||
|
||||
def _problem_static_assets_dir(problem_id: str) -> pathlib.Path:
    """
    Return the per-problem assets directory:
    <STORAGE_DIR>/zeta/_static/assets/<problem_id>
    """
    return pathlib.Path(STORAGE_DIR, "zeta", "_static", "assets", problem_id)
|
||||
|
||||
|
||||
def _guess_image_mime(src_url: str, content_type: str | None) -> str:
|
||||
if content_type:
|
||||
mime = content_type.split(";", 1)[0].strip().lower()
|
||||
if mime.startswith("image/"):
|
||||
return mime
|
||||
|
||||
parsed = urllib.parse.urlparse(src_url)
|
||||
ext = pathlib.Path(parsed.path).suffix.lower()
|
||||
ext_to_mime = {
|
||||
".jpg": "image/jpeg",
|
||||
".jpeg": "image/jpeg",
|
||||
".png": "image/png",
|
||||
".gif": "image/gif",
|
||||
".webp": "image/webp",
|
||||
".svg": "image/svg+xml",
|
||||
".bmp": "image/bmp",
|
||||
".ico": "image/x-icon",
|
||||
}
|
||||
return ext_to_mime.get(ext, "image/png")
|
||||
|
||||
|
||||
def _download_image_for_offline(problem_id: str, src_url: str, seq: int, force: bool) -> str | None:
|
||||
req = urllib.request.Request(
|
||||
src_url,
|
||||
headers={
|
||||
"User-Agent": (
|
||||
"Mozilla/5.0 (X11; Linux x86_64) "
|
||||
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||
"Chrome/123.0.0.0 Safari/537.36"
|
||||
),
|
||||
"Referer": f"https://www.acmicpc.net/problem/{problem_id}",
|
||||
},
|
||||
)
|
||||
|
||||
try:
|
||||
with urllib.request.urlopen(req, timeout=10) as resp:
|
||||
status = getattr(resp, "status", 200)
|
||||
if status != 200:
|
||||
return None
|
||||
content_type = resp.headers.get("Content-Type")
|
||||
image_bytes = resp.read()
|
||||
except (urllib.error.HTTPError, urllib.error.URLError):
|
||||
return None
|
||||
|
||||
mime = _guess_image_mime(src_url, content_type)
|
||||
encoded = base64.b64encode(image_bytes).decode("ascii")
|
||||
return f"data:{mime};base64,{encoded}"
|
||||
|
||||
|
||||
def _localize_images_in_html(problem_id: str, html_fragment: str, force: bool) -> str:
|
||||
base_url = f"https://www.acmicpc.net/problem/{problem_id}"
|
||||
counter = {"i": 0}
|
||||
cache: dict[str, str] = {}
|
||||
|
||||
pattern = re.compile(
|
||||
r'(<img\b[^>]*?\bsrc\s*=\s*)(["\']?)([^"\'>\s]+)(["\']?)',
|
||||
flags=re.IGNORECASE,
|
||||
)
|
||||
|
||||
def repl(m: re.Match) -> str:
|
||||
prefix = m.group(1)
|
||||
q1 = m.group(2)
|
||||
src = m.group(3)
|
||||
|
||||
if src.startswith("data:"):
|
||||
return m.group(0)
|
||||
|
||||
abs_url = urllib.parse.urljoin(base_url, src)
|
||||
if abs_url in cache:
|
||||
local_src = cache[abs_url]
|
||||
quote = q1 if q1 else '"'
|
||||
return f"{prefix}{quote}{local_src}{quote}"
|
||||
|
||||
counter["i"] += 1
|
||||
local_src = _download_image_for_offline(problem_id, abs_url, counter["i"], force=force)
|
||||
if not local_src:
|
||||
return m.group(0)
|
||||
|
||||
cache[abs_url] = local_src
|
||||
quote = q1 if q1 else '"'
|
||||
return f"{prefix}{quote}{local_src}{quote}"
|
||||
|
||||
return pattern.sub(repl, html_fragment)
|
||||
|
||||
|
||||
def _extract_html_by_id(raw_html: str, tag: str, element_id: str) -> str | None:
|
||||
pattern = rf"<{tag}[^>]*id=\"{re.escape(element_id)}\"[^>]*>(.*?)</{tag}>"
|
||||
m = re.search(pattern, raw_html, flags=re.DOTALL | re.IGNORECASE)
|
||||
if not m:
|
||||
return None
|
||||
return m.group(1).strip()
|
||||
|
||||
|
||||
def _strip_tags(html_text: str) -> str:
|
||||
text = re.sub(r"<[^>]+>", "", html_text, flags=re.DOTALL)
|
||||
return " ".join(text.split())
|
||||
|
||||
|
||||
def _render_math_expressions(html_fragment: str) -> str:
    """
    Turn TeX math in the fragment into MathML so it renders offline.

    - Inline: $...$   -> <span class="math-inline">
    - Block:  $$...$$ -> <div class="math-block">

    The fragment is returned unchanged when the converter is unavailable
    (latex_to_mathml is None). Content inside <pre> and <code> elements is
    never converted, and an expression the converter rejects is left as is.
    """
    if latex_to_mathml is None:
        return html_fragment

    stash: list[str] = []

    def protect(m: re.Match) -> str:
        # Swap a pre/code element for a numbered placeholder.
        stash.append(m.group(0))
        return f"@@PROTECTED_{len(stash) - 1}@@"

    working = re.sub(
        r"<(pre|code)\b[^>]*>.*?</\1>",
        protect,
        html_fragment,
        flags=re.DOTALL | re.IGNORECASE,
    )

    def make_replacer(wrapper: str):
        # Build a re.sub callback that wraps the converted MathML in `wrapper`.
        def replace(m: re.Match) -> str:
            expr = unescape(m.group(1).strip())
            if not expr:
                return m.group(0)
            try:
                return wrapper.format(latex_to_mathml(expr))
            except Exception:
                # Best effort: keep the original TeX when conversion fails.
                return m.group(0)

        return replace

    # Blocks first, so the inline pattern never sees a "$$" delimiter that
    # was converted.
    working = re.sub(
        r"\$\$(.+?)\$\$",
        make_replacer('<div class="math-block">{}</div>'),
        working,
        flags=re.DOTALL,
    )
    working = re.sub(
        r"(?<!\$)\$(?!\$)(.+?)(?<!\$)\$(?!\$)",
        make_replacer('<span class="math-inline">{}</span>'),
        working,
        flags=re.DOTALL,
    )

    # Put the protected elements back.
    for index, original in enumerate(stash):
        working = working.replace(f"@@PROTECTED_{index}@@", original)

    return working
|
||||
|
||||
|
||||
def _wrap_offline_section(heading: str, body_html: str) -> str:
    # Wrap one titled chunk of content in the <article> markup the page CSS styles.
    return "\n".join(
        [
            '<article class="section">',
            f"<h2>{heading}</h2>",
            body_html,
            "</article>",
        ]
    )


def _collect_core_sections(problem_id: str, raw_html: str, force: bool) -> list[str]:
    # Build the statement/input/output/limit/hint sections present in the page.
    core_specs = [
        ("problem_description", "문제"),
        ("problem_input", "입력"),
        ("problem_output", "출력"),
        ("problem_limit", "제한"),
        ("problem_hint", "힌트"),
    ]

    sections: list[str] = []
    for content_id, label in core_specs:
        content = _extract_html_by_id(raw_html, "div", content_id)
        if not content or not content.strip():
            continue

        localized = _localize_images_in_html(problem_id, content, force=force)
        localized = _render_math_expressions(localized)
        sections.append(_wrap_offline_section(label, localized))
    return sections


def _collect_sample_sections(problem_id: str, raw_html: str, force: bool) -> list[str]:
    # Build one section per sample <pre>: all inputs, then all outputs, each in numeric order.
    sections: list[str] = []
    for sample_type, sample_label in (("sampleinput", "예제 입력"), ("sampleoutput", "예제 출력")):
        sample_pattern = rf"<section[^>]*id=\"{sample_type}(\d+)\"[^>]*>(.*?)</section>"
        sample_matches = sorted(
            re.finditer(sample_pattern, raw_html, flags=re.DOTALL | re.IGNORECASE),
            key=lambda m: int(m.group(1)),
        )

        for m in sample_matches:
            idx = m.group(1)
            section_html = m.group(2)

            pre_match = re.search(
                r"(<pre[^>]*>.*?</pre>)",
                section_html,
                flags=re.DOTALL | re.IGNORECASE,
            )
            if not pre_match:
                continue

            # Sample data is kept verbatim: images are localized, but no math
            # rendering is applied.
            pre_html = _localize_images_in_html(
                problem_id,
                pre_match.group(1),
                force=force,
            )

            # Prefer the page's own heading text; otherwise synthesize one.
            h2_match = re.search(
                r"<h2[^>]*>(.*?)</h2>",
                section_html,
                flags=re.DOTALL | re.IGNORECASE,
            )
            if h2_match:
                heading = _strip_tags(h2_match.group(1))
            else:
                heading = f"{sample_label} {idx}"

            sections.append(_wrap_offline_section(heading, pre_html))
    return sections


def make_offline_problem_html(problem_id: str, raw_html: str, force: bool) -> str:
    """
    Build a self-contained offline-friendly HTML page from BOJ raw HTML.

    Args:
        problem_id: problem id; used in the page <title>, as the fallback
            heading, and when resolving image URLs.
        raw_html: HTML of the problem page as downloaded.
        force: forwarded unchanged to the image localizer.

    Returns:
        A complete HTML document with inline CSS. It contains the core
        sections found in the page followed by the sample sections. When
        nothing could be extracted, a single section holding the first
        100000 characters of the escaped raw HTML is emitted instead.
    """
    title = _extract_html_by_id(raw_html, "span", "problem_title")
    if not title:
        title = f"BOJ {problem_id}"

    blocks = _collect_core_sections(problem_id, raw_html, force)
    blocks.extend(_collect_sample_sections(problem_id, raw_html, force))

    if not blocks:
        body_fallback = (
            '<article class="section">'
            "<h2>원본 페이지</h2>"
            "<p>문제 본문 파싱에 실패하여 원본 HTML을 포함합니다.</p>"
            f"<pre>{escape(raw_html[:100000])}</pre>"
            "</article>"
        )
        blocks.append(body_fallback)

    content_html = "\n".join(blocks)

    # Literal CSS braces are doubled because this is an f-string.
    return f"""<!DOCTYPE html>
<html lang="ko">
<head>
  <meta charset="UTF-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>BOJ {problem_id} - Offline</title>
  <style>
    :root {{
      --bg: #fafaf8;
      --paper: #ffffff;
      --ink: #1e1f24;
      --muted: #6a6d75;
      --line: #d8dce3;
      --accent: #0d6e6e;
      --code-bg: #f4f6fb;
    }}
    * {{ box-sizing: border-box; }}
    body {{
      margin: 0;
      background:
        radial-gradient(circle at 15% 0%, #f0efe9 0%, transparent 42%),
        radial-gradient(circle at 85% 20%, #e7f1f2 0%, transparent 38%),
        var(--bg);
      color: var(--ink);
      font-family: "Noto Sans KR", "Pretendard", "Apple SD Gothic Neo", sans-serif;
      line-height: 1.65;
    }}
    main {{
      max-width: 980px;
      margin: 0 auto;
      padding: 24px 16px 56px;
    }}
    .header {{
      background: var(--paper);
      border: 1px solid var(--line);
      border-radius: 14px;
      padding: 18px 20px;
      margin-bottom: 18px;
    }}
    .header h1 {{ margin: 0 0 6px; font-size: 1.5rem; }}
    .header p {{ margin: 0; color: var(--muted); font-size: 0.95rem; }}
    .header a {{ color: var(--accent); text-decoration: none; }}
    .section {{
      background: var(--paper);
      border: 1px solid var(--line);
      border-radius: 14px;
      padding: 16px 18px;
      margin-bottom: 14px;
      overflow-x: auto;
    }}
    h2 {{
      margin: 0 0 10px;
      font-size: 1.05rem;
      color: var(--accent);
      border-bottom: 1px solid var(--line);
      padding-bottom: 8px;
    }}
    pre, code {{
      font-family: "JetBrains Mono", "Fira Code", monospace;
      background: var(--code-bg);
    }}
    pre {{
      padding: 12px;
      border-radius: 10px;
      border: 1px solid #e7ebf2;
      overflow: auto;
    }}
    blockquote {{
      margin: 14px 0;
      padding: 16px 16px 14px 22px;
      border-left: 4px solid var(--accent);
      border-radius: 10px;
      background: linear-gradient(90deg, #eef8f8 0%, #f9fdfd 100%);
      color: #24313a;
      font-weight: 600;
      position: relative;
    }}
    blockquote::before {{
      content: "“";
      position: absolute;
      left: 8px;
      top: 2px;
      font-size: 1.35rem;
      line-height: 1;
      color: #0b5f5f;
      opacity: 0.7;
    }}
    blockquote > :first-child {{ margin-top: 0; }}
    blockquote > :last-child {{ margin-bottom: 0; }}
    q {{
      color: #114f50;
      font-weight: 700;
      background: #edf8f8;
      border-radius: 6px;
      padding: 0 4px;
    }}
    .math-inline math {{
      font-size: 1em;
      vertical-align: middle;
    }}
    .math-block {{
      margin: 10px 0;
      padding: 8px 10px;
      overflow-x: auto;
      background: #f8fbff;
      border: 1px solid #e2ecf8;
      border-radius: 8px;
    }}
    .math-block math {{
      font-size: 1.04em;
      display: block;
    }}
    table {{ border-collapse: collapse; width: 100%; }}
    th, td {{ border: 1px solid var(--line); padding: 6px 8px; }}
    img {{ max-width: 100%; height: auto; }}
  </style>
</head>
<body>
  <main>
    <header class="header">
      <h1>{title}</h1>
    </header>
    {content_html}
  </main>
</body>
</html>
"""
|
||||
|
||||
|
||||
def save_problem_html(problem_id: str, html: str, force: bool) -> str:
    """
    Write the offline page to storage/zeta/_static/<id>.html

    The _static directory is created when missing. An existing file is
    only overwritten when ``force`` is true.

    Return: "fetched" when the file was written, "skipped" when an existing
    file was kept.
    """
    pathlib.Path(STORAGE_DIR, "zeta", "_static").mkdir(parents=True, exist_ok=True)

    target = _problem_static_html_path(problem_id)
    if force or not target.exists():
        target.write_text(html, encoding="utf-8")
        return "fetched"
    return "skipped"
|
||||
|
||||
|
||||
# Root command group; subcommands are attached with cli.add_command(...).
# Deliberately has no docstring: click would use it as the group's help text.
@click.group()
def cli():
    pass
|
||||
@@ -791,6 +1297,76 @@ def find(keyword: str, completed: bool | None):
|
||||
click.echo(f" {status} {file_name}.{lang_name}")
|
||||
|
||||
|
||||
def _fetch_and_store_problem(problem_id: str, force: bool) -> str:
    # Run fetch -> offline conversion -> save for one id; returns "fetched" or "skipped".
    if _problem_static_html_path(problem_id).exists() and not force:
        # Checked before any network access so existing pages cost nothing.
        return "skipped"

    raw_html = fetch_boj_problem_html(problem_id)
    offline_html = make_offline_problem_html(problem_id, raw_html, force=force)
    return save_problem_html(problem_id, offline_html, force=force)


@click.command(name="fetchprob")
@click.argument("target", type=str, nargs=1, required=True)
@click.option("--force", "-f", is_flag=True, help="Overwrite existing HTML files")
def fetchprob(target: str, force: bool):
    """
    Fetch BOJ problem HTML into storage/zeta/_static.

    TARGET:
      zeta/<id> Fetch one problem
      zeta Fetch all detected problem ids under storage/zeta
    """
    # parse_fetchprob_target raises a usage error for any location other
    # than "zeta", so the location needs no second check here.
    _, problem_id = parse_fetchprob_target(target)

    # Single mode: errors propagate so click reports them and exits non-zero.
    if problem_id is not None:
        if _fetch_and_store_problem(problem_id, force) == "skipped":
            click.echo(f"{problem_id}: skipped (already exists)")
        else:
            click.echo(f"{problem_id}: fetched (offline processed + images)")
        return

    # Batch mode: one failing problem must not stop the others.
    ids = collect_zeta_problem_ids()
    if not ids:
        click.echo("No problem ids found in storage/zeta")
        return

    attempted = len(ids)
    fetched = 0
    skipped = 0
    failed = 0

    for pid in ids:
        try:
            result = _fetch_and_store_problem(pid, force)
        except click.ClickException as e:
            failed += 1
            click.echo(f"{pid}: failed ({e.message})")
            continue
        except OSError as e:
            # e.g. the page could not be written to disk; count it as a
            # failure instead of aborting the rest of the batch.
            failed += 1
            click.echo(f"{pid}: failed ({e})")
            continue

        if result == "skipped":
            skipped += 1
            click.echo(f"{pid}: skipped")
        else:
            fetched += 1
            click.echo(f"{pid}: fetched (offline processed + images)")

    click.echo()
    click.secho(
        (
            f"Summary - attempted: {attempted}, fetched: {fetched}, "
            f"skipped: {skipped}, failed: {failed}"
        ),
        fg="cyan",
        bold=True,
    )
|
||||
|
||||
|
||||
|
||||
cli.add_command(run)
|
||||
cli.add_command(load)
|
||||
@@ -799,6 +1375,7 @@ cli.add_command(export)
|
||||
cli.add_command(state)
|
||||
cli.add_command(show)
|
||||
cli.add_command(find)
|
||||
cli.add_command(fetchprob)
|
||||
|
||||
if __name__ == "__main__":
|
||||
cli()
|
||||
|
||||
Reference in New Issue
Block a user