This commit is contained in:
2024-07-09 06:50:23 +03:00
commit 990b5d684a
12 changed files with 633 additions and 0 deletions

View File

@@ -0,0 +1,6 @@
import asyncio
from .main import run_parse
if __name__ == "__main__":
    # Package entry point: build the top-level coroutine, then hand it to a
    # fresh event loop and block until the whole run has finished.
    pipeline = run_parse()
    asyncio.run(pipeline)

View File

@@ -0,0 +1,41 @@
import asyncio
import time
from pathlib import Path
from . import parser, writer
async def parser_task(file: Path):
    """Read one HTML file and return the data extracted from it.

    Args:
        file: Path of the HTML file to process.

    Returns:
        Whatever ``parser.parse_html`` produces for the element returned by
        ``parser.read_html``.
    """
    return await parser.parse_html(await parser.read_html(file))
async def run_parse():
    """Parse every HTML file in the data directory and write one .docx per file.

    Input files are taken from ``<cwd>/testcase_paraminer/data/*.html`` and the
    documents are written to ``<cwd>/testcase_paraminer/output``. Progress, the
    per-file status and the total elapsed time are printed to stdout. A failure
    while processing one file is reported and does not stop the others.
    """
    start_time = time.perf_counter()
    base_path = Path.cwd().joinpath("testcase_paraminer")
    data_path = base_path.joinpath("data")
    output_path = base_path.joinpath("output")
    # Make sure the destination exists up front so that saving a document
    # cannot fail merely because the folder is missing.
    output_path.mkdir(parents=True, exist_ok=True)

    files = sorted(data_path.glob("*.html"))
    # Start all parse tasks before awaiting any of them so they can overlap.
    tasks = [
        asyncio.create_task(parser_task(file), name=file.name) for file in files
    ]
    print(f" --- Total tasks: {len(tasks)} --- Starting")
    for file, task in zip(files, tasks):
        try:
            task_result = await task
            task_status = await writer.write_docx(
                # ``stem`` drops only the final suffix, so "a.b.html" becomes
                # "a.b.docx" instead of colliding with every other "a.*.html".
                filename=output_path.joinpath(f"{file.stem}.docx"),
                data=task_result,
            )
            print(f" --- {task.get_name()} --- {task_status}")
        except Exception as e:
            # Top-level boundary: report which file failed and keep going.
            print(f" --- {task.get_name()} --- Failed: {e!r}")
    print(f" --- Elapsed time: {time.perf_counter() - start_time:.1f} seconds --- ")
if __name__ == "__main__":
    # Run the full parse/convert pipeline on a fresh event loop and block
    # until it has finished.
    pipeline = run_parse()
    asyncio.run(pipeline)

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,89 @@
from pathlib import Path
from typing import Any
import aiofiles
from bs4 import BeautifulSoup, NavigableString, Tag
async def read_html(filename: Path) -> Tag | NavigableString | None:
    """Load an HTML file and return its ``<div id="premain">`` element.

    Args:
        filename: Path of the HTML file to read.

    Returns:
        The first ``div`` whose ``id`` is ``premain``, or ``None`` when the
        document has no such element.
    """
    # Read raw bytes instead of text: text mode would decode with the platform
    # default encoding, which corrupts non-ASCII (Cyrillic) content on systems
    # whose locale does not match the file. BeautifulSoup detects the encoding
    # from the document itself when it is given bytes.
    async with aiofiles.open(filename, "rb") as f:
        raw = await f.read()
    soup = BeautifulSoup(raw, "html.parser")
    return soup.find("div", {"id": "premain"})
async def parse_html(data: Tag | NavigableString | None) -> dict[Any, Any]:
    """Extract the objective coefficients and the tables from a parsed element.

    Args:
        data: Element to extract from; ``None`` is passed through to the
            individual extractors.

    Returns:
        The result of ``modify_data`` applied to a dict with the keys
        ``"f_x"`` (coefficients) and ``"tables"`` (extracted tables).
    """
    coefficients = await parse_x_coefficents(data)
    tables = await parse_tables(data)
    return await modify_data({"f_x": coefficients, "tables": tables})
async def parse_x_coefficents(data: Tag | NavigableString | None) -> list[Any]:
    """Collect the coefficient strings that follow the "Симплекс-метод" heading.

    Args:
        data: Element to search; ``None`` yields an empty list.

    Returns:
        A list of strings: the last whitespace-separated token of the sixth
        sibling after the heading (minus its final character), followed by
        every later sibling text ending in ``"x"`` (minus that ``"x"`` and any
        ``"+"``), up to the closing marker text. Empty when the heading or the
        sixth sibling is missing.
    """
    if data is None:
        return []

    node = data.find("b", string="Симплекс-метод")
    # Step up to six siblings forward, stopping early if the chain runs out.
    hops = 0
    while hops < 6 and node is not None:
        node = node.next_sibling
        hops += 1
    if node is None:
        return []

    # Drop the trailing character of the last token in this node's text.
    coefficients = [node.text.split()[-1][:-1]]
    node = node.next_sibling
    stop_text = " при следующих условиях-ограничений."
    while node is not None:
        text = node.text
        if text == stop_text:
            break
        if text.endswith("x"):
            coefficients.append(text[:-1].replace("+", ""))
        node = node.next_sibling
    return coefficients
async def parse_tables(data: Tag | NavigableString | None) -> dict[int, list[Any]]:
    """Extract the cell contents of the relevant ``table-bordered`` tables.

    A table is kept when at least one of its ``td`` cells has a ``bgcolor``
    attribute, or when it is the last matching table. Kept tables are numbered
    consecutively from 0.

    Args:
        data: Element to search; ``None`` yields an empty dict.

    Returns:
        Mapping of table number to a list of rows. Each row is a list of cell
        texts; a cell with ``bgcolor`` is stored as ``(text, bgcolor)``.
    """
    if data is None:
        return {}

    found = data.find_all("table", {"class": "table-bordered"})
    last_idx = len(found) - 1
    kept = {}
    for idx, table in enumerate(found):
        has_marked = False
        rows = []
        for tr in table.find_all("tr"):
            cells = []
            for td in tr.find_all("td"):
                if td.has_attr("bgcolor"):
                    has_marked = True
                    cells.append((td.text, td["bgcolor"]))
                else:
                    cells.append(td.text)
            rows.append(cells)
        if has_marked or idx == last_idx:
            # The next free key equals the number of tables kept so far.
            kept[len(kept)] = rows
    return kept
async def modify_data(data: dict[Any, Any]) -> dict[Any, Any]:
    """Add a coefficient header row and a coefficient column to every table.

    For each table in ``data["tables"]`` a header row
    ``["", "C", "-", *data["f_x"], "0", ...]`` is inserted at the top, padded
    with ``"0"`` so it is one cell wider than the table's original first row.
    Every original row then gets a new leading cell: the header entry for
    variable ``x<k>`` when the row's first cell is ``"x<k>"``, otherwise ``""``.

    Args:
        data: Dict with the keys ``"f_x"`` (list of coefficient strings) and
            ``"tables"`` (mapping of table number to list of rows). The tables
            are modified in place.

    Returns:
        The same ``data`` object.

    Raises:
        ValueError: If a first cell starts with ``"x"`` but the remainder is
            not an integer.
        IndexError: If the variable index points past the header row.
    """
    for table in data["tables"].values():
        header = ["", "C", "-", *data["f_x"]]
        # An empty table has no first row to measure; treat its width as 0.
        width = len(table[0]) if table else 0
        # +1 accounts for the extra leading column added to every row below.
        header.extend(["0"] * (width - len(header) + 1))
        table.insert(0, header)
        for row in table[1:]:
            first = row[0] if row else ""
            # Highlighted cells are stored as (text, color) tuples.
            label = first[0] if isinstance(first, tuple) else first
            if label.startswith("x"):
                # Header layout: x1 sits at index 3, hence the +2 offset.
                row.insert(0, header[int(label[1:]) + 2])
            else:
                row.insert(0, "")
    return data

View File

@@ -0,0 +1,26 @@
from pathlib import Path
from typing import Any
from docx import Document
from docx.shared import RGBColor
async def write_docx(filename: Path, data: dict[str, Any]) -> str:
    """Write every table in ``data["tables"]`` to a .docx file.

    Each table is preceded by a ``"Table <key>"`` paragraph. Cells given as
    tuples are written using their first element and coloured red.

    Args:
        filename: Destination path of the document.
        data: Dict whose ``"tables"`` entry maps a table key to a list of
            rows; each cell is either a string or a tuple whose first element
            is the cell text.

    Returns:
        ``"Ok"`` on success, otherwise a description of the error that
        occurred (the exception is not propagated).
    """
    doc = Document()
    try:
        for idx, rows in data["tables"].items():
            doc.add_paragraph(f"Table {idx}")
            # The first row determines the column count of the grid.
            grid = doc.add_table(rows=len(rows), cols=len(rows[0]))
            for i, row in enumerate(rows):
                for j, col in enumerate(row):
                    # Look the cell up once and reuse it for text and colour.
                    cell = grid.cell(i, j)
                    cell.text = col if isinstance(col, str) else str(col[0])
                    if isinstance(col, tuple):
                        cell.paragraphs[0].runs[0].font.color.rgb = RGBColor(
                            255, 0, 0
                        )
        doc.save(str(filename))
        return "Ok"
    except Exception as e:
        # Some exceptions carry no message; fall back to the type name so the
        # caller never receives a blank status.
        return str(e) or type(e).__name__