Files
VSTU_Schedule_Parser/main.py

258 lines
9.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Copyright Stanislav Mironov
# Project-wide convention: coordinates are always given as ROW first, then COL; indices are zero-based.
import json
import os
import random
import time
import traceback
import uuid
import aigenerated
import parser
import translations
import utils
import json
import links_parser
import shutil
import hashes
def currt():
    """Return the current Unix timestamp rounded to the nearest whole second."""
    now = time.time()
    return round(now)
# Faculty identifiers handed to links_parser.parse_links(); stored in sorted order.
FACULTETS = sorted(
    (
        "asp",
        "mag",
        "fastiv",
        "fat",
        "ftkm",
        "ftpp",
        "feu",
        "fevt",
        "htf",
        "vkf",
        "mmf",
        "fpik",
    )
)

# Working directory for downloaded Excel files; main() wipes and recreates it.
DIRNAME = "excels"

# File holding the "diffable dates" string from the previous run.
DIFFABLE_DATES = "diffable_dates.txt"

# When True, main() keeps going even if the diffable dates did not change.
SKIP_DIFFABLE_DATES = True

# Set to a single faculty id (e.g. 'fevt') to restrict link parsing to it.
DEBUG_ONE_FAC = None

# Copied into parser.LOGGING before every Excel file is processed.
LOGGING = False

# Accumulates every element found under a "raw" key by process_obj().
unique_raws = set()
# Root document that main() serialises to result.json / result-no-indent.json.
# process_excel_file() fills "excels", "group_names_parsed" and the "stat"
# counters; main() fills "stat.total_parsing_time" and "unique_raws".
result = {
    "version": 1,
    "notice": "ОТКАЗ ОТ ОТВЕТСТВЕННОСТИ: Данные, доступ к API и т.д. предоставляется КАК-ЕСТЬ (AS-IS) без каких либо, явно или не явно подразумеваемых гарантий.\n\nПарсер написал: Миронов Станислав\n\nИсточник данных: https://www.vstu.ru/student/raspisaniya/zanyatiy/index.php",
    # Timestamp taken once, when this module is loaded.
    "actual_at": round(time.time()),
    "documentation": "https://fazziclay.com/api/v1/vstu_schedule_parser/scheme.json (temporary outdated)",
    "daypicture": "0w0",
    "daycite": "KIlLSWITCH",
    "contact": "https://fazziclay.com/",
    "university": "VSTU",
    "university_site": "https://www.vstu.ru/",
    "source": "https://fazziclay.com/api/v1/vstu_schedule_parser/result.json",
    "stat": {
        # -1 is a placeholder; main() overwrites it with the timer value.
        "total_parsing_time": -1,
        # Per-file outcome counters: "fine" = at least one group parsed.
        "excels": {
            "fine": 0,
            "bad": 0
        },
        # Incremented once per parsed group.
        "groups": 0,
        # NOTE(review): never updated anywhere in the visible code — confirm.
        "unique_raws": -1
    },
    # Hand-maintained message for API consumers; "updated_at" is a fixed
    # timestamp of the last text change, not of the parse run.
    "api_notices": {
        "updated_at": 1773523692,
        "text_pre1": "Пожалуйста сохраняйте 'updated_at', это время изменения ЭТОГО текста. Тут возможно будут появлятся важные BREAKING CHANGES и дедлайны к ним.\nПо хорошему если updated_at другой по сравнению с вашем кэшем это сообщение должно отправляться вам в телеграм как уведомление о поедстоящих изменениях\nwarning=True значит 'text' содержит важное а не как щас hint.\n\n ~fazziclay aka Stanislav;\n\n2025-10-05: добавлено data_source_hash в эксель и в группу. Это SHA1 of скачанный эксель файл.",
        "text": "2026-03-15 BREAKING CHANGES! By Stanislav Mironov.\n\nИзменено многое в угоду унифкации и расширению спаршенных групп. Пока alpha",
        "warning": True,
        "tut-plavayuschaya-struktura": "required only 'updated_at', 'text' and 'warning'"
    },
    "debug": {
        "bleu~~": 3
    },
    # One entry per processed Excel file, appended by process_excel_file().
    "excels": [],
    "facultets": FACULTETS,
    "group_names_parsed": [],
    # Holds the live set here; main() replaces it with a sorted list before
    # dumping, because a set is not JSON-serialisable.
    "unique_raws": unique_raws,
    "see_header_at_top_of_this_file": "SEE TOP OF THIS FILE | ОБРАТИТЕ ВНИМАНИЕ НА ВЕРХ ЭТОГО ФАЙЛА"
}
def process_obj(data):
    """Walk nested dicts/lists and collect the contents of every "raw" entry.

    For each dict item whose key is "raw", the elements of its value are
    added to the module-level ``unique_raws`` set; every dict value and list
    element is then visited recursively. Other types are ignored.

    Any exception is printed and swallowed, so a malformed subtree never
    aborts the caller.
    """
    try:
        if isinstance(data, dict):
            for field, payload in data.items():
                if field == "raw":
                    # NOTE(review): set.update() iterates its argument, so a
                    # plain str here would be added character by character —
                    # presumably "raw" always holds a list; confirm.
                    unique_raws.update(payload)
                process_obj(payload)
            return
        # Lists are traversed element by element.
        if isinstance(data, list):
            for element in data:
                process_obj(element)
    except Exception as err:
        print("Failed process_obj")
        print(err)
def process_excel_file(facultet, excel_url, counter, latest_changed):
    """Download one Excel file, parse all of its sheets and record the outcome.

    Args:
        facultet: Faculty identifier the file belongs to.
        excel_url: URL of the Excel file; its last path segment is used as
            the file name and its suffix decides between .xls and .xlsx.
        counter: Running number embedded in the local download file name.
        latest_changed: Value stored as-is under "latest_changed".

    Side effects:
        Appends an info dict to ``result['excels']``, extends
        ``result['group_names_parsed']``, bumps the ``result['stat']``
        counters, feeds group slots to ``process_obj`` and, on failure,
        appends a record to ``faileds``. Exceptions raised while downloading
        or parsing are caught, printed and stored under ``excel_info['error']``.
    """
    excel_filename = excel_url.split("/")[-1]
    extension = ".xlsx" if excel_url.endswith(".xlsx") else ".xls"
    download_place = f"{DIRNAME}/_[C{counter}]_{facultet}{extension}"

    # NOTE(review): only files whose name contains "ФЭУ" are processed; this
    # looks like a temporary debugging filter — confirm it is intentional.
    if "ФЭУ" not in excel_filename:
        print("SKIPPED")
        return

    excel_info = {
        "filename": excel_filename,
        "data_source_hash": None,
        "url": excel_url,
        "latest_changed": latest_changed,
        "download_place": download_place,
        "group_names_parsed": [],
        "facultet": facultet,
        "counter": counter,
        "sheets": []
    }
    parser.LOGGING = LOGGING

    try:
        aigenerated.download_file_from_url(excel_url, download_place)
        excel_info['data_source_hash'] = hashes.calculate_sha1(download_place)

        reader = translations.create_reader(download_place)
        print("Reader info")
        print(reader.info())

        while True:
            print(f"Parsing sheet №{reader.get_sheet_index()+1} (from 1)")
            sheet_dict = {
                "index": reader.get_sheet_index(),
                "name": reader.get_sheet_name(),
                "reader_info": reader.info(),
                "group_names_parsed": [],
                "groups": {}
            }
            # Registered before parsing so the sheet entry survives a
            # parser exception.
            excel_info['sheets'].append(sheet_dict)

            prs = parser.Parser(reader)
            print("Parser created; parser.parse();")
            prs.parse()
            print("parsed done!")

            # Optional diagnostics are only emitted when non-empty.
            if len(prs.raw_no_schedule) > 0:
                sheet_dict["raw_no_schedule"] = prs.raw_no_schedule
            if len(prs.features) > 0:
                sheet_dict["features"] = sorted(prs.features)
            if prs.parser_error is not None:
                sheet_dict["parser_error"] = prs.parser_error
            if prs.parser_warnings is not None and len(prs.parser_warnings) > 0:
                sheet_dict["parser_warnings"] = prs.parser_warnings

            for group_name, gr in prs.groups.items():
                gr["excel_url"] = excel_url
                for name_list in (
                    sheet_dict["group_names_parsed"],
                    excel_info["group_names_parsed"],
                    result["group_names_parsed"],
                ):
                    name_list.append(group_name)
                result['stat']['groups'] += 1
                # Set per group, so it is absent for sheets without groups.
                sheet_dict['week_keys_metadata'] = prs.week_keys_metadata
                sheet_dict['groups'][group_name] = gr
                process_obj(gr['slots'])

            print(f"Populates {len(prs.groups)} groups: " + " ".join(prs.groups))

            if not reader.has_next_sheet():
                print("File ended")
                break
            reader.next_sheet()
            print("Next sheet!")
    except Exception as e:
        print(f"Error while {excel_url}")
        print(e)
        traceback.print_exc()
        # Random id printed next to the traceback so the JSON error entry
        # can be matched with the console log.
        u = uuid.uuid4()
        excel_info['error'] = {
            "smile": ":(",
            "error_message": str(e),
            "log_anchor": str(u),
            "time": currt()
        }
        print(f"Log Anchor: {u}")
        faileds.append({
            "ex": e,
            "fac": facultet,
            "url": excel_url
        })

    result['excels'].append(excel_info)
    # A file counts as "fine" as soon as at least one group was parsed.
    if excel_info['group_names_parsed']:
        bucket = "fine"
    else:
        bucket = "bad"
    result['stat']['excels'][bucket] += 1
# Failure records ({"ex", "fac", "url"}) appended by process_excel_file()
# and printed by main() at the end of the run.
faileds = []
def main():
    """Run the full pipeline: collect links, parse every Excel, dump JSON.

    Steps:
        1. Recreate the ``DIRNAME`` download directory.
        2. Parse the Excel links and compare their "diffable dates" string
           with the one stored in ``DIFFABLE_DATES``; stop early when nothing
           changed and ``SKIP_DIFFABLE_DATES`` is False.
        3. Process every link with ``process_excel_file``.
        4. Write ``result.json`` (indented) and ``result-no-indent.json``.
        5. Print collected failures and remove the download directory.

    Raises:
        Exception: re-raised when the download directory cannot be created.
    """
    t = utils.StepTimeCounter()

    try:
        try:
            shutil.rmtree(DIRNAME)
            print(f"Directory '{DIRNAME}' and its contents deleted successfully.")
        except Exception as e:
            # Best effort: the directory usually does not exist on a clean run.
            print(f"Error deleting directory '{DIRNAME}': {e}")
        os.mkdir(DIRNAME)
        print(f"Directory '{DIRNAME}' created successfully.")
    except Exception:
        print(f"Failed create '{DIRNAME}': ")
        raise

    print("main(); parse links starting...")
    EXCEL_LINKS = links_parser.parse_links(FACULTETS if DEBUG_ONE_FAC is None else [DEBUG_ONE_FAC])
    now_diffable_dates = links_parser.excels_to_diffabledates(EXCEL_LINKS)

    prev_diffable_dates = None
    # Use the constant for the existence check too, so the file that is
    # checked is always the file that is read.
    if os.path.exists(DIFFABLE_DATES):
        with open(DIFFABLE_DATES, 'r') as fp:
            prev_diffable_dates = fp.read().strip()

    # NOTE(review): the new dates are persisted before parsing, so a crash
    # later in this run makes the next run see "no changes" — confirm intent.
    with open(DIFFABLE_DATES, 'w') as fp:
        fp.write(now_diffable_dates)

    if now_diffable_dates == prev_diffable_dates:
        print("No date changes in vstu.ru website. Stopping")
        if not SKIP_DIFFABLE_DATES:
            return
        print("SKIP_DIFFABLE_DATES is True, force resuming")

    # Counter starts at 10001 for the first file and is embedded in the
    # downloaded file name.
    for counter, excel_link in enumerate(EXCEL_LINKS, start=10001):
        process_excel_file(
            excel_link['facultet'],
            excel_link['url'],
            counter,
            excel_link['last_changed'],
        )

    print("Saving result.json")
    result['stat']['total_parsing_time'] = t.step()
    # A set is not JSON-serialisable; store a sorted list instead.
    result['unique_raws'] = sorted(unique_raws)

    # ensure_ascii=False emits raw Cyrillic, so the encoding is pinned to
    # UTF-8 instead of relying on the platform default; the context manager
    # guarantees the file is flushed and closed.
    with open('result.json', 'w', encoding='utf-8') as fp:
        json.dump(result, fp, indent=2, ensure_ascii=False)
    print("Saved to result.json indent=2")

    with open('result-no-indent.json', 'w', encoding='utf-8') as fp:
        json.dump(result, fp, ensure_ascii=False)
    print("Saved to result-no-indent.json")

    print("Faileds:")
    print(faileds)

    # Delete the download directory together with its contents.
    try:
        shutil.rmtree(DIRNAME)
        print(f"Directory '{DIRNAME}' and its contents deleted successfully.")
    except Exception as e:
        print(f"Error deleting directory '{DIRNAME}': {e}")
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    print("Start")
    main()
    print("Bye!")