fixes 3 pairs подряд, etc

This commit is contained in:
2025-09-12 20:07:04 +03:00
parent 6920d24a98
commit ed65e5b483
8 changed files with 239 additions and 78 deletions

138
main.py
View File

@@ -1,62 +1,127 @@
# Copyright Stanislav Mironov
# Project-wide rule: coordinates are always given as ROW first, then COL; numbering starts from zero
import json
import re
import os
import time
import traceback
from urllib.parse import urljoin
import pandas as pd
import xlwt
import xlrd
import requests
from bs4 import BeautifulSoup
import uuid
import aigenerated
import parser
import translations
import utils
import json
import links_parser
# Project-wide rule: coordinates are always given as ROW first, then COL; numbering starts from zero
import shutil
def currt() -> int:
    """Return the current Unix time rounded to the nearest whole second."""
    return round(time.time())
# Faculty identifiers. main() passes this list to links_parser.parse_links,
# and it is also exported as result["facultets"].
FACULTETS = [
"asp", "mag", "fastiv", "fat", "ftkm", "ftpp", "feu", "fevt", "htf", "vkf", "mmf", "fpik"
]
# Working directory for downloaded workbooks: created at the start of main(),
# used as the download path prefix, and removed again at the end of main().
DIRNAME = "excels"
# When not None, main() requests links for this single faculty only.
# NOTE(review): the assignment appears twice in a row; both set None, the last
# one wins. Presumably the removed and added side of the rendered diff.
DEBUG_ONE_FAC = None #'fevt'
DEBUG_ONE_FAC = None #'htf'
# Parsed groups keyed by group name; filled by process_excel_file().
result_groups = {}
# Skeleton of the document that main() dumps to result.json.
result = {
"version": 1,
"notice": "ОТКАЗ ОТ ОТВЕТСТВЕННОСТИ: Данные, доступ к API и т.д. предоставляется КАК-ЕСТЬ (AS-IS) без каких либо, явно или не явно подразумеваемых гарантий.\n\nПарсер написал: Миронов Станислав",
# Evaluated once, when this module-level literal is built.
"actual_at": round(time.time()),
"documentation": "TODO",
"daypicture": "QwQ",
"university": "VSTU",
"university_site": "https://www.vstu.ru/",
"stat": {
# -1 is a placeholder; main() overwrites it with t.step() before saving.
"total_parsing_time": -1,
},
"api_notices": {
"updated_at": 1757688552,
"text": "Пожалуйста сохраняйте 'updated_at', это время изменения ЭТОГО текста. Тут возможно будут появлятся важные BREAKING CHANGES и дедлайны к ним.\nПо хорошему если updated_at другой по сравнению с вашем кэшем это сообщение должно отправляться вам в телеграм как уведомление о поедстоящих изменениях\nwarning=True значит 'text' содержит важное а не как щас hint.\n\n ~fazziclay aka Stanislav;",
"warning": False,
"tut-plavayuschaya-struktura": "required only 'updated_at', 'text' and 'warning'"
},
# process_excel_file() appends the name of every group it skips as a duplicate.
"doubled_groups": [],
"debug": {
"bleu~~": 1
},
# process_excel_file() appends one excel_info dict per processed workbook.
"excels": [],
"facultets": FACULTETS,
"emptykey1": "",
"emptykey2": "",
# Same dict object as result_groups, so groups stored there show up here.
"groups": result_groups,
"emptykey3": "",
"emptykey4": "",
"see_header_at_top_of_this_file": "SEE TOP OF THIS FILE | ОБРАТИТЕ ВНИМАНИЕ НА ВЕРХ ЭТОГО ФАЙЛА"
}
# Download one workbook, parse its sheets in a loop and merge the parsed groups
# into the module-level result_groups. Per-file statistics, warnings and errors
# are collected in excel_info, which is appended to result['excels'].
#
# facultet  - faculty identifier; used in the download file name and stored on
#             every parsed group.
# excel_url - URL of the workbook; its last path segment is kept as the
#             display file name / data source.
# counter   - value embedded in the download file name and stored for debugging.
# timeid    - string embedded in the download file name (main() passes the run
#             start time as a string).
#
# NOTE(review): this text comes from a rendered diff with indentation removed;
# several neighbouring lines look like the old and the new version of the same
# statement. They are kept verbatim and flagged below.
def process_excel_file(facultet, excel_url, counter, timeid):
# Keep the ".xlsx" extension when the URL has it, otherwise use ".xls".
is_xlsx = excel_url.endswith(".xlsx")
filename = f"{DIRNAME}/" + timeid + f"_[C{counter}]_" + facultet + ".xls" + ("x" if is_xlsx else "")
excel_info = {
"filename": excel_url.split("/")[-1],
"url": excel_url,
"download_place": filename,
# -1 marks a stage that has not completed; 'cycles' counts parse-loop iterations.
"stat": {
"download": -1,
"create_reader": -1,
"parse": -1,
"cycles": 0
},
"group_names_parsed": [],
"facultet": facultet,
"counter": counter
}
parser.LOGGING = False
try:
# NOTE(review): filename is reassigned here with a different pattern and a
# hard-coded "excels/" prefix, so excel_info["download_place"] (captured above)
# no longer matches the path actually used for the download. Presumably the
# removed side of the diff.
filename = "excels/" + timeid + "_" + facultet + f"_[C{counter}]" + ".xls" + ("x" if is_xlsx else "")
# utils.StepTimeCounter is not shown here; presumably step() returns the time
# elapsed since the previous step - TODO confirm.
t = utils.StepTimeCounter()
aigenerated.download_file_from_url(excel_url, filename)
excel_info["stat"]['download'] = t.step()
reader = translations.create_reader(filename)
print("Reader info")
print(reader.info())
excel_info["stat"]['create_reader'] = t.step()
# One iteration per sheet; the loop exit is in lines not visible in this view.
while True:
# NOTE(review): the same message is printed again three lines below with a
# "(from 1)" suffix - likely old/new versions of one statement.
print(f"Parsing sheet №{reader.get_sheet_index()+1}")
parser.LOGGING = False
excel_info['stat']['cycles'] += 1
print(f"Parsing sheet №{reader.get_sheet_index()+1} (from 1)")
prs = parser.Parser(reader)
prs.parse()
if prs.parser_error is not None:
# NOTE(review): excel_info['stat']['cycles'] is an int (initialised to 0 and
# incremented above), so "str" + int raises TypeError here; the counter needs
# str(...) or an f-string to build the key.
excel_info["parser_error_cycle_" + excel_info['stat']['cycles']] = prs.parser_error
for group_name in prs.groups.keys():
# NOTE(review): the first test looks at the top-level keys of `result`, not at
# group names; the second one tests result_groups. Likely old/new versions of
# the same condition.
if group_name in result.keys():
if group_name in result_groups.keys():
# A group with this name was already stored: record a warning and skip it.
print(f" -- WTF -- Doubled groups -- name: {group_name}")
if 'warning_doubled_groups_skip' not in excel_info.keys():
excel_info['warning_doubled_groups_skip'] = []
excel_info['warning_doubled_groups_skip'].append(group_name)
result['doubled_groups'].append(group_name)
continue
# NOTE(review): the first assignment stores the group as a top-level key of
# `result`; the second stores it in result_groups. Likely old/new versions.
gr = result[group_name] = prs.groups[group_name]
gr = result_groups[group_name] = prs.groups[group_name]
gr['facultet'] = facultet
gr['data_source'] = excel_url.split("/")[-1]
# NOTE(review): the 'parser_debug' literal is opened but never closed before
# the 'debug' assignment starts - not valid as written; likely old/new versions
# of the same dict.
gr['parser_debug'] = {
"C_COUNTER": counter,
gr['debug'] = {
"counter": counter,
"timeid": timeid,
"excel_url": excel_url,
"reader_info": reader.info(),
"reader_sheet_index": reader.get_sheet_index(),
"filename": filename
}
excel_info["group_names_parsed"].append(group_name)
print(f"Populates {len(prs.groups)} groups to result: " + " ".join(prs.groups.keys()))
# NOTE(review): the next line is a diff hunk header; the unchanged lines between
# the two hunks are not visible in this view.
@@ -67,22 +132,40 @@ def process_excel_file(facultet, excel_url, counter, timeid):
reader.next_sheet()
print("Next sheet!")
excel_info["stat"]['parse'] = t.step()
except Exception as e:
# Any failure for this workbook is logged, recorded in excel_info['error'] and
# added to the module-level `faileds` list.
print(f"Error while {excel_url}")
print(e)
traceback.print_exc()
# Random id printed next to the traceback and stored in the output, so the
# error entry can be matched with the log.
u = uuid.uuid4()
excel_info['error'] = {
"smile": ":(",
"error_message": str(e),
"log_anchor": str(u),
"time": currt()
}
print(f"Log Anchor: {u}")
faileds.append({
"ex": e,
"fac": facultet,
"url": excel_url
})
# Indentation is lost in this view; presumably this runs after the try/except
# for both successful and failed files - TODO confirm.
result['excels'].append(excel_info)
# NOTE(review): rebinding `result` to an empty dict here would discard the
# skeleton defined earlier in the file ('excels', 'stat', 'doubled_groups', ...),
# which process_excel_file() and main() index into. Presumably this is the
# removed side of the rendered diff - confirm against the real file.
result = {}
# Failures collected by process_excel_file(); printed at the end of main().
faileds = []
# Entry point of a parsing run: create the download directory, obtain the
# workbook links per faculty, process them (loop body not visible in this view),
# write `result` to result.json, print the failures and delete the download
# directory.
def main():
# NOTE(review): the same parse_links call appears again a few lines below,
# after the directory is created - likely old/new positions of one statement.
EXCEL_LINKS = links_parser.parse_links(FACULTETS if DEBUG_ONE_FAC is None else [DEBUG_ONE_FAC])
t = utils.StepTimeCounter()
# NOTE(review): every exception from os.mkdir is reported as "already exists",
# which also hides e.g. permission errors; os.makedirs(DIRNAME, exist_ok=True)
# would cover the intended case without the broad except.
try:
os.mkdir(DIRNAME)
print(f"Directory '{DIRNAME}' created successfully.")
except Exception:
print(f"Directory '{DIRNAME}' already exists.")
print("main(); parse links starting...")
# Only DEBUG_ONE_FAC is requested when it is set, otherwise all FACULTETS.
EXCEL_LINKS = links_parser.parse_links(FACULTETS if DEBUG_ONE_FAC is None else [DEBUG_ONE_FAC])
counter = 0
# Run start time as a string.
timeid = str(round(time.time()))
for facultet in EXCEL_LINKS.keys():
# NOTE(review): the next line is a diff hunk header; the loop body is not
# visible in this view.
@@ -99,13 +182,24 @@ def main():
print("Excel file processing done!")
print("Saving result.json")
result['stat']['total_parsing_time'] = t.step()
# NOTE(review): the file object is never closed explicitly and no encoding is
# given; with ensure_ascii=False the Cyrillic strings in `result` are written
# as-is, so the platform default encoding decides whether this succeeds.
# Prefer `with open('result.json', 'w', encoding='utf-8') as f:`.
json.dump(result, open('result.json', 'w'), indent=2, ensure_ascii=False)
print("Saved to result.json")
print("Faileds:")
print(faileds)
# Delete a non-empty directory and its contents
try:
shutil.rmtree(DIRNAME)
print(f"Directory '{DIRNAME}' and its contents deleted successfully.")
except Exception as e:
print(f"Error deleting directory '{DIRNAME}': {e}")
# Run the parser only when the file is executed as a script, not on import.
if __name__ == "__main__":
    print("Start")
    main()
    print("Bye!")