Source code for onlinejudge_template.analyzer.html

"""
Find the input format string written inside ``<pre>`` tags in an HTML document.

この module は HTML を解析し ``<pre>`` タグに囲まれた入力フォーマット文字列を発見します。

たとえば `AtCoder Grand Contest 038: F - Two Permutations <https://atcoder.jp/contests/agc038/tasks/agc038_f>`_ の HTML は
::

    <h3>Input</h3><p>Input is given from Standard Input in the following format:</p>
    <pre><var>N</var>
    <var>P_0</var> <var>P_1</var> <var>\\cdots</var> <var>P_{N-1}</var>
    <var>Q_0</var> <var>Q_1</var> <var>\\cdots</var> <var>Q_{N-1}</var>
    </pre>

という部分文字列を含みますが、ここから次のような文字列を抜き出します。
::

    N
    P_0 P_1 \\cdots P_{N-1}
    Q_0 Q_1 \\cdots Q_{N-1}
"""

from logging import getLogger
from typing import *

import bs4

from onlinejudge_template.types import AnalyzerError

logger = getLogger(__name__)


class HTMLParserError(AnalyzerError):
    """The error raised when the expected format section is not found in the HTML."""
# Heading texts that introduce the input ('in') and output ('out') format sections.
table = {
    'in': ('Input', 'Input / 入力', '入力'),
    'out': ('Output', 'Output / 出力', '出力'),
}


def _extract_format_string_from_pre(x: bs4.Tag) -> str:
    """
    Flatten the children of a tag into a format string.

    ``<var>`` tags are kept as ``<var>...</var>``, ``<br>`` tags are emitted as a
    literal ``<br>``, ``<code>`` tags are unwrapped, and any other tag is unwrapped
    with a warning. Text nodes are copied as they are and HTML comments are dropped.

    :param x: the tag (usually ``<pre>``) whose children are flattened
    :return: the concatenated string
    """
    parts = []
    for y in x:
        if isinstance(y, bs4.Tag):
            if y.name == 'br':
                parts.append('<br>')
                parts.append(_extract_format_string_from_pre(y))  # It seems some `<pre> xxx <br /> yyy </pre>` is recognized as `<pre> xxx <br> yyy </br></pre>`. e.g. https://yukicoder.me/problems/no/1078
            elif y.name == 'var':
                parts.append('<var>')
                parts.append(_extract_format_string_from_pre(y))
                parts.append('</var>')
            elif y.name == 'code':
                parts.append(_extract_format_string_from_pre(y))
            else:
                logger.warning('ignored an unexpected tag: %s', y)
                parts.append(_extract_format_string_from_pre(y))
        elif isinstance(y, bs4.Comment):
            # Comment is a subclass of NavigableString, so it must be tested
            # before NavigableString; otherwise comments leak into the result.
            continue
        elif isinstance(y, bs4.NavigableString):
            parts.append(y.string)
        else:
            # Raised explicitly so that the check survives ``python -O``.
            raise AssertionError('unexpected node type: %r' % type(y))
    return ''.join(parts)
def _find_pre_by_heading(soup: bs4.BeautifulSoup, *, heading_tag: str, kind: str) -> Optional[bs4.Tag]:
    """Return the first ``<pre>`` inside the parent of a heading whose text is listed in ``table[kind]``, or ``None``."""
    for heading in soup.find_all(heading_tag):
        if heading.string in table[kind]:
            pre = heading.parent.find('pre')
            if pre:
                return pre
    return None


def parse_generic_format_string(html: bytes, *, kind: str, url: str) -> str:
    """
    Extract the input or output format string from the HTML of a problem page.

    The layout to look for is chosen by a substring match on ``url``.

    :param html: the HTML of the problem page
    :param kind: ``"in"`` or ``"out"``
    :param url: the URL of the problem page
    :return: the stripped format string followed by a line terminator
    :raises HTMLParserError: if the expected section is not found in a page of a supported site
    :raises NotImplementedError: if the site is not supported
    """
    soup = bs4.BeautifulSoup(html, 'html.parser')
    logger.debug('parsed HTML: %s...', repr(str(soup))[:200])

    if 'atcoder.jp' in url:
        pre = _find_pre_by_heading(soup, heading_tag='h3', kind=kind)
        if pre is None:
            raise HTMLParserError
        # NOTE: this branch terminates the result with CRLF while the other sites use LF.
        return _extract_format_string_from_pre(pre).strip() + '\r\n'

    if 'yukicoder.me' in url:
        pre = _find_pre_by_heading(soup, heading_tag='h4', kind=kind)
        if pre is None:
            raise HTMLParserError
        return _extract_format_string_from_pre(pre).strip() + '\n'

    if 'yosupo.jp' in url:
        # TODO: update this for new site https://judge.yosupo.jp/. The current implementation is for https://old.yosupo.jp/.
        # An explicit raise instead of ``assert`` so that the check is not stripped under ``python -O``.
        if 'old.yosupo.jp' not in url:
            raise NotImplementedError('only old.yosupo.jp is supported: %s' % url)
        for h2 in soup.find_all('h2'):
            # The heading text is looked up in the <div> elements inside the <h2>.
            if not any(div.string in table[kind] for div in h2.find_all('div')):
                continue
            pre = h2.find_next_sibling('pre')
            if not pre:
                continue
            code = pre.find('code')
            if code:
                return _extract_format_string_from_pre(code).strip() + '\n'
        raise HTMLParserError

    raise NotImplementedError('unsupported site: %s' % url)
def parse_input_format_string(html: bytes, *, url: str) -> str:
    """
    Extract the input format string from the HTML of a problem page.

    This is :func:`parse_generic_format_string` called with ``kind='in'``.
    """
    return parse_generic_format_string(html, url=url, kind='in')
def parse_output_format_string(html: bytes, *, url: str) -> str:
    """
    Extract the output format string from the HTML of a problem page.

    This is :func:`parse_generic_format_string` called with ``kind='out'``.
    """
    return parse_generic_format_string(html, url=url, kind='out')