Source code for onlinejudge_template.analyzer.html
"""
the module to find the input format string written with ``<pre>`` tags from HTML
この module は HTML を解析し ``<pre>`` タグに囲まれた入力フォーマット文字列を発見します。
たとえば `AtCoder Grand Contest 038: F - Two Permutations <https://atcoder.jp/contests/agc038/tasks/agc038_f>`_ の HTML は
::
<h3>Input</h3><p>Input is given from Standard Input in the following format:</p>
<pre><var>N</var>
<var>P_0</var> <var>P_1</var> <var>\\cdots</var> <var>P_{N-1}</var>
<var>Q_0</var> <var>Q_1</var> <var>\\cdots</var> <var>Q_{N-1}</var>
</pre>
という部分文字列を含みますが、ここから次のような文字列を抜き出します。
::
N
P_0 P_1 \cdots P_{N-1}
Q_0 Q_1 \cdots Q_{N-1}
"""
from logging import getLogger
from typing import *
import bs4
from onlinejudge_template.types import AnalyzerError
logger = getLogger(__name__)
table = {
'in': ('Input', 'Input / 入力', '入力'),
'out': ('Output', 'Output / 出力', '出力'),
}
def _extract_format_string_from_pre(x: bs4.Tag) -> str:
s = ''
for y in x:
if isinstance(y, bs4.Tag):
if y.name == 'br':
s += '<br>'
s += _extract_format_string_from_pre(y) # It seems some `<pre> xxx <br /> yyy </pre>` is recognized as `<pre> xxx <br> yyy </br></pre>`. e.g. https://yukicoder.me/problems/no/1078
elif y.name == 'var':
s += '<var>'
s += _extract_format_string_from_pre(y)
s += '</var>'
elif y.name == 'code':
s += _extract_format_string_from_pre(y)
else:
logger.warning('ignored an unexpected tag: %s', y)
s += _extract_format_string_from_pre(y)
elif isinstance(y, bs4.NavigableString):
s += y.string
elif isinstance(y, bs4.Comment):
pass
else:
assert False
return s
[docs]def parse_generic_format_string(html: bytes, *, kind: str, url: str) -> str:
"""
:param kind: ``"in"`` or ``"out"``
:raises HTMLParserError:
"""
soup = bs4.BeautifulSoup(html, 'html.parser')
logger.debug('parsed HTML: %s...', repr(str(soup))[:200])
if 'atcoder.jp' in url:
for h3 in soup.find_all('h3'):
if h3.string in table[kind]:
pre = h3.parent.find('pre')
if pre:
return _extract_format_string_from_pre(pre).strip() + '\r\n'
raise HTMLParserError
elif 'yukicoder.me' in url:
for h4 in soup.find_all('h4'):
if h4.string in table[kind]:
pre = h4.parent.find('pre')
if pre:
return _extract_format_string_from_pre(pre).strip() + '\n'
raise HTMLParserError
elif 'yosupo.jp' in url:
assert 'old.yosupo.jp' in url # TODO: update this for new site https://judge.yosupo.jp/. The current implementation is for https://old.yosupo.jp/.
for h2 in soup.find_all('h2'):
found = False
for div in h2.find_all('div'):
if div.string in table[kind]:
found = True
if found:
pre = h2.find_next_sibling('pre')
if pre:
code = pre.find('code')
if code:
return _extract_format_string_from_pre(code).strip() + '\n'
raise HTMLParserError
else:
raise NotImplementedError
[docs]def parse_input_format_string(html: bytes, *, url: str) -> str:
return parse_generic_format_string(html, kind='in', url=url)
[docs]def parse_output_format_string(html: bytes, *, url: str) -> str:
return parse_generic_format_string(html, kind='out', url=url)