258 lines
9.3 KiB
Python
258 lines
9.3 KiB
Python
# Copyright Stanislav Mironov
|
||
|
||
# Project-wide rule: coordinates always list ROW first, then COL; numbering starts at zero
|
||
|
||
|
||
import json
|
||
import os
|
||
import random
|
||
import time
|
||
import traceback
|
||
import uuid
|
||
import aigenerated
|
||
import parser
|
||
import translations
|
||
import utils
|
||
import json
|
||
import links_parser
|
||
import shutil
|
||
import hashes
|
||
|
||
def currt():
    """Return the current Unix timestamp rounded to the nearest whole second."""
    now = time.time()
    return round(now)
|
||
|
||
# Faculty codes handed to the links parser, kept in alphabetical order.
FACULTETS = sorted((
    "asp", "mag", "fastiv", "fat", "ftkm", "ftpp",
    "feu", "fevt", "htf", "vkf", "mmf", "fpik",
))
# Working directory for downloaded Excel files.
DIRNAME = "excels"
# File holding the date fingerprint written by the previous run.
DIFFABLE_DATES = "diffable_dates.txt"

# When True, main() keeps going even if the date fingerprint did not change.
SKIP_DIFFABLE_DATES = True

# Set to a single faculty code (e.g. 'fevt') to restrict link parsing to it.
DEBUG_ONE_FAC = None
# Copied into parser.LOGGING before each Excel file is processed.
LOGGING = False

# Accumulates values found under "raw" keys; shared with result["unique_raws"].
unique_raws = set()

# Skeleton of the JSON document written at the end of main().
result = {
    "version": 1,
    "notice": (
        "ОТКАЗ ОТ ОТВЕТСТВЕННОСТИ: Данные, доступ к API и т.д. предоставляется КАК-ЕСТЬ (AS-IS) без каких либо, явно или не явно подразумеваемых гарантий.\n\n"
        "Парсер написал: Миронов Станислав\n\n"
        "Источник данных: https://www.vstu.ru/student/raspisaniya/zanyatiy/index.php"
    ),
    "actual_at": round(time.time()),
    "documentation": "https://fazziclay.com/api/v1/vstu_schedule_parser/scheme.json (temporary outdated)",
    "daypicture": "0w0",
    "daycite": "KIlLSWITCH",
    "contact": "https://fazziclay.com/",
    "university": "VSTU",
    "university_site": "https://www.vstu.ru/",
    "source": "https://fazziclay.com/api/v1/vstu_schedule_parser/result.json",
    "stat": {
        # -1 marks a value that has not been filled in yet.
        "total_parsing_time": -1,
        "excels": {
            "fine": 0,
            "bad": 0,
        },
        "groups": 0,
        "unique_raws": -1,
    },
    "api_notices": {
        "updated_at": 1773523692,
        "text_pre1": (
            "Пожалуйста сохраняйте 'updated_at', это время изменения ЭТОГО текста. Тут возможно будут появлятся важные BREAKING CHANGES и дедлайны к ним.\n"
            "По хорошему если updated_at другой по сравнению с вашем кэшем это сообщение должно отправляться вам в телеграм как уведомление о поедстоящих изменениях\n"
            "warning=True значит 'text' содержит важное а не как щас hint.\n\n"
            " ~fazziclay aka Stanislav;\n\n"
            "2025-10-05: добавлено data_source_hash в эксель и в группу. Это SHA1 of скачанный эксель файл."
        ),
        "text": (
            "2026-03-15 BREAKING CHANGES! By Stanislav Mironov.\n\n"
            "Изменено многое в угоду унифкации и расширению спаршенных групп. Пока alpha"
        ),
        "warning": True,
        "tut-plavayuschaya-struktura": "required only 'updated_at', 'text' and 'warning'",
    },
    "debug": {
        "bleu~~": 3,
    },
    "excels": [],
    "facultets": FACULTETS,
    "group_names_parsed": [],
    # Same set object as the module-level accumulator above.
    "unique_raws": unique_raws,
    "see_header_at_top_of_this_file": "SEE TOP OF THIS FILE | ОБРАТИТЕ ВНИМАНИЕ НА ВЕРХ ЭТОГО ФАЙЛА",
}
|
||
|
||
def process_obj(data):
    """Walk nested dicts/lists and collect every value stored under a "raw" key.

    Collected values go into the module-level ``unique_raws`` set. A ``raw``
    value that is a plain string is added as one entry; any other value is
    treated as an iterable of entries. Values that are neither dict nor list
    are ignored.

    Args:
        data: Arbitrary nested structure of dicts and lists.

    Errors are printed rather than raised, so one malformed node cannot abort
    the caller; the remaining keys of the dict that failed are skipped.
    """
    try:
        if isinstance(data, dict):
            for key, value in data.items():
                if key == "raw":
                    if isinstance(value, str):
                        # set.update() would iterate the string and store
                        # single characters instead of the raw text itself.
                        unique_raws.add(value)
                    else:
                        unique_raws.update(value)

                # Descend into every value, including the one under "raw".
                process_obj(value)

        # For a list, visit each of its elements.
        elif isinstance(data, list):
            for item in data:
                process_obj(item)

    except Exception as e:
        print("Failed process_obj")
        print(e)
|
||
|
||
def process_excel_file(facultet, excel_url, counter, latest_changed, filename_filter="ФЭУ"):
    """Download one Excel schedule, parse all of its sheets and record the outcome.

    Args:
        facultet: Faculty code; stored in the record and used in the local file name.
        excel_url: URL of the Excel file; its last path segment is the file name.
        counter: Sequence number embedded in the local download file name.
        latest_changed: Stored unchanged under ``latest_changed`` in the record.
        filename_filter: Substring the file name must contain for the file to be
            processed; ``None`` disables the check. The default reproduces the
            previously hard-coded value.

    Returns:
        None. A skipped file leaves no trace. Otherwise a record is appended to
        ``result['excels']``, group names/counters in ``result`` are updated and
        ``result['stat']['excels']`` counts the file as "fine" (at least one
        group parsed) or "bad".

    Any exception raised while downloading or parsing is caught: it is printed
    with a traceback, stored under ``error`` in the record and appended to
    ``faileds``. Sheets parsed before the failure stay in the record.
    """
    excel_filename = excel_url.split("/")[-1]
    # NOTE(review): this filter looks like a leftover debugging aid; it is kept
    # as the default so behaviour is unchanged, but callers can now pass
    # filename_filter=None to process every file.
    if filename_filter is not None and filename_filter not in excel_filename:
        print("SKIPPED")
        return

    extension = ".xlsx" if excel_url.endswith(".xlsx") else ".xls"
    download_place = f"{DIRNAME}/_[C{counter}]_{facultet}{extension}"

    excel_info = {
        "filename": excel_filename,
        "data_source_hash": None,
        "url": excel_url,
        "latest_changed": latest_changed,
        "download_place": download_place,
        "group_names_parsed": [],
        "facultet": facultet,
        "counter": counter,
        "sheets": [],
    }
    parser.LOGGING = LOGGING

    try:
        aigenerated.download_file_from_url(excel_url, download_place)
        excel_info['data_source_hash'] = hashes.calculate_sha1(download_place)
        reader = translations.create_reader(download_place)
        print("Reader info")
        print(reader.info())

        while True:
            print(f"Parsing sheet №{reader.get_sheet_index()+1} (from 1)")
            sheet_dict = {
                "index": reader.get_sheet_index(),
                "name": reader.get_sheet_name(),
                "reader_info": reader.info(),
                "group_names_parsed": [],
                "groups": {},
            }
            # Registered before parsing so a failing sheet still shows up.
            excel_info['sheets'].append(sheet_dict)
            prs = parser.Parser(reader)

            print("Parser created; parser.parse();")
            prs.parse()

            print("parsed done!")

            # Optional keys are only emitted when they carry information.
            if len(prs.raw_no_schedule) > 0:
                sheet_dict["raw_no_schedule"] = prs.raw_no_schedule

            if len(prs.features) > 0:
                sheet_dict["features"] = sorted(prs.features)

            if prs.parser_error is not None:
                sheet_dict["parser_error"] = prs.parser_error

            if prs.parser_warnings is not None and len(prs.parser_warnings) > 0:
                sheet_dict["parser_warnings"] = prs.parser_warnings

            if len(prs.groups) > 0:
                # Present only on sheets that yielded at least one group.
                sheet_dict['week_keys_metadata'] = prs.week_keys_metadata

            for group_name, gr in prs.groups.items():
                gr["excel_url"] = excel_url
                sheet_dict["group_names_parsed"].append(group_name)
                excel_info["group_names_parsed"].append(group_name)
                result["group_names_parsed"].append(group_name)
                result['stat']['groups'] += 1
                sheet_dict['groups'][group_name] = gr
                process_obj(gr['slots'])

            print(f"Populates {len(prs.groups)} groups: " + " ".join(prs.groups.keys()))

            if not reader.has_next_sheet():
                print("File ended")
                break

            reader.next_sheet()
            print("Next sheet!")

    except Exception as e:
        print(f"Error while {excel_url}")
        print(e)
        traceback.print_exc()
        # Random id printed to the log and stored in the record so the two
        # can be matched up later.
        u = uuid.uuid4()
        excel_info['error'] = {
            "smile": ":(",
            "error_message": str(e),
            "log_anchor": str(u),
            "time": currt(),
        }
        print(f"Log Anchor: {u}")
        faileds.append({
            "ex": e,
            "fac": facultet,
            "url": excel_url,
        })

    result['excels'].append(excel_info)
    k = "fine" if excel_info['group_names_parsed'] else "bad"
    result['stat']['excels'][k] += 1
|
||
|
||
# Failure records collected during the run: process_excel_file() appends one
# {"ex", "fac", "url"} dict per Excel file that raised, and main() prints the list.
faileds = []
|
||
def _remove_download_dir():
    """Delete DIRNAME with everything in it; failures are printed, never raised."""
    try:
        shutil.rmtree(DIRNAME)
        print(f"Directory '{DIRNAME}' and its contents deleted successfully.")
    except Exception as e:
        print(f"Error deleting directory '{DIRNAME}': {e}")


def _save_result(path, **dump_kwargs):
    """Write ``result`` to *path* as UTF-8 JSON and close the file afterwards."""
    # Non-ASCII text is written verbatim (ensure_ascii=False), so the encoding
    # must not depend on the platform default.
    with open(path, 'w', encoding='utf-8') as fp:
        json.dump(result, fp, ensure_ascii=False, **dump_kwargs)


def main():
    """Run the whole pipeline: collect links, parse every Excel file, save JSON.

    Steps:
        1. Recreate the DIRNAME download directory.
        2. Collect Excel links for FACULTETS (or only DEBUG_ONE_FAC when set).
        3. Compare the date fingerprint with the one stored in DIFFABLE_DATES
           and store the new one; return early when nothing changed unless
           SKIP_DIFFABLE_DATES is set.
        4. Process each link with process_excel_file().
        5. Fill in the final statistics and write result.json and
           result-no-indent.json.
        6. Print the collected failures and remove the download directory.

    Raises:
        Exception: whatever os.mkdir() raised if the directory cannot be created.
    """
    t = utils.StepTimeCounter()

    # Start from a clean directory; a failed delete (e.g. on the very first
    # run) is only reported.
    _remove_download_dir()
    try:
        os.mkdir(DIRNAME)
        print(f"Directory '{DIRNAME}' created successfully.")
    except Exception:
        print(f"Failed create '{DIRNAME}': ")
        raise

    print("main(); parse links starting...")
    EXCEL_LINKS = links_parser.parse_links(FACULTETS if DEBUG_ONE_FAC is None else [DEBUG_ONE_FAC])
    now_diffable_dates = links_parser.excels_to_diffabledates(EXCEL_LINKS)
    prev_diffable_dates = None
    if os.path.exists(DIFFABLE_DATES):
        with open(DIFFABLE_DATES, 'r') as fp:
            prev_diffable_dates = fp.read().strip()

    with open(DIFFABLE_DATES, 'w') as fp:
        fp.write(now_diffable_dates)

    # The stored copy is stripped on read, so strip the fresh one as well;
    # otherwise surrounding whitespace would make the two never compare equal.
    if now_diffable_dates.strip() == prev_diffable_dates:
        print("No date changes in vstu.ru website. Stopping")
        if not SKIP_DIFFABLE_DATES:
            return
        print("SKIP_DIFFABLE_DATES is True, force resuming")

    # Counter values start at 10001 and end up in the download file names.
    for counter, excel_link in enumerate(EXCEL_LINKS, start=10001):
        process_excel_file(
            excel_link['facultet'],
            excel_link['url'],
            counter,
            excel_link['last_changed'],
        )

    print("Saving result.json")
    result['stat']['total_parsing_time'] = t.step()
    # Replace the -1 placeholder with the real count.
    result['stat']['unique_raws'] = len(unique_raws)
    # A set is not JSON-serializable; publish it as a sorted list.
    result['unique_raws'] = sorted(unique_raws)

    _save_result('result.json', indent=2)
    print("Saved to result.json indent=2")

    _save_result('result-no-indent.json')
    print("Saved to result-no-indent.json")

    print("Faileds:")
    print(faileds)

    # Delete a non-empty directory and its contents
    _remove_download_dir()
|
||
|
||
|
||
if __name__ == "__main__":
    # Run the pipeline only when executed as a script, not when imported.
    print("Start")
    main()
    print("Bye!")
|