refactor: big refactor

This commit is contained in:
2026-03-18 22:15:49 +03:00
parent 7e0e4a0b71
commit 1199ce1554
11 changed files with 264 additions and 555 deletions

282
main.py
View File

@@ -9,14 +9,12 @@ import random
import time
import traceback
import uuid
import aigenerated
import parser
import translations
import utils
import json
import links_parser
import shutil
import hashes
def currt():
    """Return the current Unix timestamp rounded to the nearest whole second."""
    now = time.time()
    return round(now)
@@ -25,119 +23,40 @@ FACULTETS = sorted([
"asp", "mag", "fastiv", "fat", "ftkm", "ftpp", "feu", "fevt", "htf", "vkf", "mmf", "fpik"
])
DIRNAME = "excels"
DIFFABLE_DATES = "diffable_dates.txt"
SKIP_DIFFABLE_DATES = True
PARSED_DIR = "parsed"
DEBUG_ONE_FAC = None #'fevt'
LOGGING = False
unique_raws = set()
result = {
"version": 1,
"notice": "ОТКАЗ ОТ ОТВЕТСТВЕННОСТИ: Данные, доступ к API и т.д. предоставляется КАК-ЕСТЬ (AS-IS) без каких либо, явно или не явно подразумеваемых гарантий.\n\nПарсер написал: Миронов Станислав\n\nИсточник данных: https://www.vstu.ru/student/raspisaniya/zanyatiy/index.php",
"actual_at": round(time.time()),
"documentation": "https://fazziclay.com/api/v1/vstu_schedule_parser/scheme.json (temporary outdated)",
"daypicture": "0w0",
"daycite": "KIlLSWITCH",
"contact": "https://fazziclay.com/",
"university": "VSTU",
"university_site": "https://www.vstu.ru/",
"source": "https://fazziclay.com/api/v1/vstu_schedule_parser/result.json",
"stat": {
"total_parsing_time": -1,
"excels": {
"fine": 0,
"bad": 0
},
"groups": 0,
"unique_raws": -1
},
"api_notices": {
"updated_at": 1773523692,
"text_pre1": "Пожалуйста сохраняйте 'updated_at', это время изменения ЭТОГО текста. Тут возможно будут появлятся важные BREAKING CHANGES и дедлайны к ним.\nПо хорошему если updated_at другой по сравнению с вашем кэшем это сообщение должно отправляться вам в телеграм как уведомление о поедстоящих изменениях\nwarning=True значит 'text' содержит важное а не как щас hint.\n\n ~fazziclay aka Stanislav;\n\n2025-10-05: добавлено data_source_hash в эксель и в группу. Это SHA1 of скачанный эксель файл.",
"text": "2026-03-15 BREAKING CHANGES! By Stanislav Mironov.\n\nИзменено многое в угоду унифкации и расширению спаршенных групп. Пока alpha",
"warning": True,
"tut-plavayuschaya-struktura": "required only 'updated_at', 'text' and 'warning'"
},
"debug": {
"bleu~~": 3
},
"excels": [],
"facultets": FACULTETS,
"group_names_parsed": [],
"unique_raws": unique_raws,
"see_header_at_top_of_this_file": "SEE TOP OF THIS FILE | ОБРАТИТЕ ВНИМАНИЕ НА ВЕРХ ЭТОГО ФАЙЛА"
}
parser.LOGGING = LOGGING = False
def process_obj(data):
def parse_sheets(download_place):
to_return = {}
try:
if isinstance(data, dict):
for key, value in data.items():
if key == "raw":
unique_raws.update(value)
process_obj(value)
# Если это список, проходим по его элементам
elif isinstance(data, list):
for item in data:
process_obj(item)
except Exception as e:
print("Failed process_obj")
print(e)
def process_excel_file(facultet, excel_url, counter, latest_changed):
is_xlsx = excel_url.endswith(".xlsx")
download_place = f"{DIRNAME}/" + f"_[C{counter}]_" + facultet + ".xls" + ("x" if is_xlsx else "")
excel_filename = excel_url.split("/")[-1]
if "ФЭУ" not in excel_filename:
print("SKIPPED")
return
excel_info = {
"filename": excel_filename,
"data_source_hash": None,
"url": excel_url,
"latest_changed": latest_changed,
"download_place": download_place,
"group_names_parsed": [],
"facultet": facultet,
"counter": counter,
"sheets": []
}
parser.LOGGING = LOGGING
try:
aigenerated.download_file_from_url(excel_url, download_place)
sha1hash = hashes.calculate_sha1(download_place)
excel_info['data_source_hash'] = sha1hash
reader = translations.create_reader(download_place)
print("Reader info")
print(reader.info())
while True:
t = utils.StepTimeCounter()
print(f"Parsing sheet №{reader.get_sheet_index()+1} (from 1)")
sheet_dict = {
"index": reader.get_sheet_index(),
"name": reader.get_sheet_name(),
"reader_info": reader.info(),
"group_names_parsed": [],
"groups": {}
}
excel_info['sheets'].append(sheet_dict)
to_return["SHEET_"+str(reader.get_sheet_index())] = sheet_dict
prs = parser.Parser(reader)
print("Parser created; parser.parse();")
prs.parse()
print("parsed done!")
sheet_dict['parse_time'] = round(t.step())
if len(prs.raw_no_schedule) > 0:
sheet_dict["raw_no_schedule"] = prs.raw_no_schedule
sheet_dict["other_raws"] = prs.raw_no_schedule
if len(prs.features) > 0:
sheet_dict["features"] = sorted(prs.features)
@@ -147,16 +66,11 @@ def process_excel_file(facultet, excel_url, counter, latest_changed):
if prs.parser_warnings is not None and len(prs.parser_warnings) > 0:
sheet_dict["parser_warnings"] = prs.parser_warnings
for group_name in prs.groups.keys():
gr = prs.groups[group_name]
gr["excel_url"] = excel_url
sheet_dict["group_names_parsed"].append(group_name)
excel_info["group_names_parsed"].append(group_name)
result["group_names_parsed"].append(group_name)
result['stat']['groups'] += 1
for group_name_key in prs.groups.keys():
gr = prs.groups[group_name_key]
sheet_dict['week_keys_metadata'] = prs.week_keys_metadata
sheet_dict['groups'][group_name] = gr
process_obj(gr['slots'])
sheet_dict['groups'][group_name_key] = gr
print(f"Populates {len(prs.groups)} groups: " + " ".join(prs.groups.keys()))
@@ -168,31 +82,57 @@ def process_excel_file(facultet, excel_url, counter, latest_changed):
print("Next sheet!")
except Exception as e:
print(f"Error while {excel_url}")
print(e)
traceback.print_exc()
u = uuid.uuid4()
excel_info['error'] = {
to_return['error'] = {
"smile": ":(",
"error_message": str(e),
"log_anchor": str(u),
"time": currt()
}
print(f"Log Anchor: {u}")
faileds.append({
"ex": e,
"fac": facultet,
"url": excel_url
})
result['excels'].append(excel_info)
k = "fine" if len(excel_info['group_names_parsed']) > 0 else "bad"
result['stat']['excels'][k] += 1
return to_return
faileds = []
def main():
global result
def parsed_file_path(excel_filename: str):
format = excel_filename.split(".")[-1]
fl = format.lower()
if fl not in ["json", "xls", "xlsx"]:
print(f"Unknown filename format: {excel_filename}")
return
if fl != "json":
excel_filename = excel_filename.replace("." + format, ".json")
excel_filename = excel_filename.lower()
filepath = PARSED_DIR + os.path.sep + excel_filename
return filepath
def load_parsed_state(excel_filename):
    """Load the previously saved parsed state for an Excel file.

    Args:
        excel_filename: Name of the Excel file; the state path is derived
            from it with ``parsed_file_path``.

    Returns:
        The deserialized JSON state, or ``None`` when the file name is not
        recognised, no state file exists yet, or the state file does not
        contain valid JSON.
    """
    filepath = parsed_file_path(excel_filename)
    # parsed_file_path() returns None for unknown extensions;
    # os.path.exists(None) would raise TypeError.
    if filepath is None or not os.path.exists(filepath):
        return None
    try:
        with open(filepath, "r", encoding="utf-8") as fp:
            return json.load(fp)
    except json.JSONDecodeError as exc:
        # Report unreadable state as missing so the caller rebuilds it
        # instead of hitting the same decode error on every call.
        print(f"Corrupted parsed state '{filepath}', ignoring it: {exc}")
        return None
def save_parsed_state(excel_filename, obj):
    """Persist the parsed state of an Excel file as JSON.

    The data is written to a temporary sibling file and then moved over the
    target with ``os.replace``, so an interrupted write cannot leave a
    half-written state file behind.

    Args:
        excel_filename: Name of the Excel file; the state path is derived
            from it with ``parsed_file_path``.
        obj: JSON-serializable state to store.

    Raises:
        ValueError: If no state path can be derived from ``excel_filename``.
    """
    filepath = parsed_file_path(excel_filename)
    if filepath is None:
        raise ValueError(f"Cannot derive a parsed-state path for '{excel_filename}'")

    # Make sure the target directory exists before writing into it.
    directory = os.path.dirname(filepath)
    if directory:
        os.makedirs(directory, exist_ok=True)

    tmp_path = filepath + ".tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as fp:
            json.dump(obj, fp, ensure_ascii=False, sort_keys=True)
        os.replace(tmp_path, filepath)
    except BaseException:
        # Do not leave a stale temporary file behind on failure.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
    print(f"Saved parsed state to '{filepath}'")
def run_session():
faileds = []
t = utils.StepTimeCounter()
# Delete tempdir
try:
try:
shutil.rmtree(DIRNAME)
@@ -205,45 +145,85 @@ def main():
print(f"Failed create '{DIRNAME}': ")
raise e
print("main(); parse links starting...")
EXCEL_LINKS = links_parser.parse_links(FACULTETS if DEBUG_ONE_FAC is None else [DEBUG_ONE_FAC])
now_diffable_dates = links_parser.excels_to_diffabledates(EXCEL_LINKS)
prev_diffable_dates = None
if os.path.exists("diffable_dates.txt"):
with open(DIFFABLE_DATES, 'r') as fp:
prev_diffable_dates = fp.read().strip()
with open(DIFFABLE_DATES, 'w') as fp:
fp.write(now_diffable_dates)
last_changeds = set()
for excel_dict in EXCEL_LINKS:
try:
last_changeds.add(excel_dict['last_changed'])
if now_diffable_dates == prev_diffable_dates:
print("No date changes in vstu.ru website. Stopping")
if not SKIP_DIFFABLE_DATES:
return
print("SKIP_DIFFABLE_DATES is True, force resuming")
counter = 10000
for excel_link in EXCEL_LINKS:
counter += 1
facultet = excel_link['facultet']
excel_url = excel_link['url']
latest_changed = excel_link['last_changed']
process_excel_file(facultet, excel_url, counter, latest_changed)
excel_url = excel_dict['url']
facultet = excel_dict['facultet']
excel_filename = excel_url.split("/")[-1]
excel_dict['json_represent'] = parsed_file_path(excel_filename).split(os.path.sep)[-1]
print("Saving result.json")
result['stat']['total_parsing_time'] = t.step()
result['unique_raws'] = sorted(unique_raws)
json.dump(result, open('result.json', 'w'), indent=2, ensure_ascii=False)
print("Saved to result.json indent=2")
state = load_parsed_state(excel_filename)
is_new = state is None
if is_new:
state = {}
else:
same_date = False
try:
same_date = state['excel']['last_changed'] == excel_dict['last_changed']
print(f"Excel[{excel_filename}]: inServer={excel_dict['last_changed']}, inState={state['excel']['last_changed']} same={same_date}")
except Exception as e:
print(f"Excel[{excel_filename}]: failed testify last_changed")
if same_date:
state['actual_at'] = currt()
try:
del state['excel']['different_in_this_session']
except: pass
save_parsed_state(excel_filename, state)
continue
excel_dict['different_in_this_session'] = True
state['actual_at'] = currt()
state['excel'] = excel_dict
is_xlsx = excel_url.endswith(".xlsx")
download_place = f"{DIRNAME}/" + excel_filename + "_" + facultet + ".xls" + ("x" if is_xlsx else "")
utils.download_file_from_url(excel_url, download_place)
sha1hash = utils.calculate_sha1(download_place)
state['excel']['sha1hash'] = sha1hash
state['sheets'] = parse_sheets(download_place)
save_parsed_state(excel_filename, state)
except Exception as e:
faileds.append({
"uuid": str(uuid.uuid4()),
"exception": str(e),
"traceback": traceback.format_exception(e),
"context": f"Failed process excel file {excel_dict['url']}"
})
traceback.print_exception(e)
json.dump(result, open('result-no-indent.json', 'w'), ensure_ascii=False)
print("Saved to result-no-indent.json")
print("Faileds:")
print(faileds)
with open("parser.json", 'w', encoding="utf-8") as fp:
lc = {"*_x": ":("}
try:
s = sorted(last_changeds)
lc = {
"early": s[0],
"newly": s[-1]
}
except: pass
json.dump({
"last_changeds": lc,
"actual_at": currt(),
"all_files": EXCEL_LINKS,
"faileds": faileds
}, fp=fp, ensure_ascii=False)
# Delete a non-empty directory and its contents
# Delete a non-empty directory and its contents
try:
shutil.rmtree(DIRNAME)
print(f"Directory '{DIRNAME}' and its contents deleted successfully.")
@@ -251,6 +231,22 @@ def main():
print(f"Error deleting directory '{DIRNAME}': {e}")
def main():
    """Run parsing sessions in an endless loop with a 30-minute pause between them.

    An exception escaping ``run_session()`` is printed together with its
    traceback and the loop continues with the next iteration.
    """
    pause_seconds = 60 * 30
    while True:
        try:
            print("BEGIN run_session();")
            run_session()
            print("END run_session();")
        except Exception as error:
            print("Exception in run_session();")
            traceback.print_exception(error)

        print("Sleep for 30 minutes")
        time.sleep(pause_seconds)
        print("Wake up!")
if __name__ == "__main__":
print("Start")
main()