# Copyright Stanislav Mironov

# Project-wide rule: coordinates are always given as ROW first and then COL,
# and numbering is zero-based.

import json
import os
import random
import shutil
import time
import traceback
import uuid

import pika
from dotenv import load_dotenv

import links_parser
import parser
import translations
import utils

load_dotenv()

RABBITMQ_URL = os.environ.get("RABBITMQ_URL")
EXCHANGE_NAME = os.environ.get("RABBITMQ_EXCHANGE", "vstu_schedule")

# Module-level RabbitMQ handles. They stay None when the broker is unreachable
# so that later code can detect the situation instead of hitting a NameError.
connection = None
channel = None


def _connect_rabbitmq():
    """(Re)open the RabbitMQ connection and declare the topic exchange.

    Rebinds the module-level ``connection`` and ``channel``. Any exception
    raised by pika while connecting is propagated to the caller.
    """
    global connection, channel
    if connection is not None and connection.is_open:
        # Best-effort close of the previous connection; a failure here must
        # not prevent opening the new one.
        try:
            connection.close()
        except Exception as e:
            print(f"Failed to close previous RabbitMQ connection: {e!r}")
    connection = pika.BlockingConnection(pika.URLParameters(RABBITMQ_URL))
    channel = connection.channel()
    channel.exchange_declare(exchange=EXCHANGE_NAME, exchange_type='topic', durable=True)


def _publish(routing_key, payload):
    """Publish ``payload`` as persistent JSON to ``EXCHANGE_NAME``.

    The connection is opened once at import time, but the process sleeps for
    many minutes between sessions, so the broker may have dropped it by the
    time the next message is sent. The channel is therefore (re)opened on
    demand and one retry is made after a connection-level failure.

    Raises:
        Exception: whatever pika raises if the retry fails as well, so the
            caller can still treat an undelivered message as a failure.
    """
    global channel
    body = json.dumps(payload, ensure_ascii=False).encode('utf-8')
    properties = pika.BasicProperties(
        content_type="application/json",
        delivery_mode=2
    )
    for attempt in (1, 2):
        try:
            if channel is None or not channel.is_open:
                _connect_rabbitmq()
            channel.basic_publish(
                exchange=EXCHANGE_NAME,
                routing_key=routing_key,
                properties=properties,
                body=body
            )
            return
        except (pika.exceptions.AMQPError, OSError) as e:
            # Force a fresh connection on the next attempt / next call.
            channel = None
            if attempt == 2:
                raise
            print(f"RabbitMQ publish failed ({e!r}); reconnecting and retrying once")


try:
    _connect_rabbitmq()
except Exception as e:
    print("Failed to connect RabbitMQ")
    traceback.print_exception(e)


def currt():
    """Return the current Unix time rounded to whole seconds."""
    return round(time.time())


FACULTETS = sorted([
    "asp",
    "mag",
    "fastiv",
    "fat",
    "ftkm",
    "ftpp",
    "feu",
    "fevt",
    "htf",
    "vkf",
    "mmf",
    "fpik"
])
DIRNAME = "excels"
PARSED_DIR = "parsed"
DEBUG_ONE_FAC = None  # 'fevt'
DEBUG_NO_SAVE_STATES = False
parser.LOGGING = LOGGING = True

# File extensions parsed_file_path() accepts (compared case-insensitively).
_KNOWN_EXTENSIONS = ("json", "xls", "xlsx")


def parse_sheets(download_place):
    """Parse every sheet of the workbook stored at ``download_place``.

    Returns:
        A dict with one ``"SHEET_<index>"`` entry per sheet that was reached.
        If anything raises while reading or parsing, the sheets collected so
        far are kept and an ``"error"`` entry (message, log anchor, time) is
        added instead of propagating the exception.
    """
    to_return = {}
    try:
        reader = translations.create_reader(download_place)
        print("Reader info")
        print(reader.info())
        while True:
            t = utils.StepTimeCounter()
            print(f"Parsing sheet №{reader.get_sheet_index()+1} (from 1)")
            sheet_dict = {
                "index": reader.get_sheet_index(),
                "name": reader.get_sheet_name(),
                "reader_info": reader.info(),
                "groups": {}
            }
            # Registered before parsing so a sheet that fails halfway is
            # still present in the result.
            to_return["SHEET_" + str(reader.get_sheet_index())] = sheet_dict

            prs = parser.Parser(reader)
            print("Parser created; parser.parse();")
            prs.parse()
            print("parsed done!")
            sheet_dict['parse_time'] = round(t.step())

            if len(prs.raw_no_schedule) > 0:
                sheet_dict["other_raws"] = prs.raw_no_schedule
            if len(prs.features) > 0:
                sheet_dict["features"] = sorted(prs.features)
            if prs.parser_error is not None:
                sheet_dict["parser_error"] = prs.parser_error
            if prs.parser_warnings is not None and len(prs.parser_warnings) > 0:
                sheet_dict["parser_warnings"] = prs.parser_warnings

            # week_keys_metadata is only attached when the sheet has at least
            # one group (it is set inside the loop).
            for group_name_key in prs.groups.keys():
                gr = prs.groups[group_name_key]
                sheet_dict['week_keys_metadata'] = prs.week_keys_metadata
                sheet_dict['groups'][group_name_key] = gr
            print(f"Populates {len(prs.groups)} groups: " + " ".join(prs.groups.keys()))

            if not reader.has_next_sheet():
                print("File ended")
                break
            reader.next_sheet()
            print("Next sheet!")
    except Exception as e:
        print(e)
        traceback.print_exc()
        u = uuid.uuid4()
        to_return['error'] = {
            "smile": ":(",
            "error_message": str(e),
            "log_anchor": str(u),
            "time": currt()
        }
        print(f"Log Anchor: {u}")
    return to_return


def parsed_file_path(excel_filename: str):
    """Map an Excel (or JSON) file name to its state file under ``PARSED_DIR``.

    The extension is checked case-insensitively; ``.xls``/``.xlsx`` is swapped
    for ``.json`` and the whole name is lower-cased.

    Returns:
        The state file path, or ``None`` (after printing a message) when the
        extension is not one of json/xls/xlsx.
    """
    stem, dot, extension = excel_filename.rpartition(".")
    ext_lower = extension.lower()
    if ext_lower not in _KNOWN_EXTENSIONS:
        print(f"Unknown filename format: {excel_filename}")
        return None
    if dot and ext_lower != "json":
        # Swap only the trailing extension; str.replace() would also rewrite
        # the same text if it occurred earlier in the name.
        excel_filename = stem + ".json"
    excel_filename = excel_filename.lower()
    return PARSED_DIR + os.path.sep + excel_filename


def load_parsed_state(excel_filename):
    """Load the saved state for ``excel_filename``.

    Returns:
        The decoded JSON object, or ``None`` when there is no state file or
        the file name has an unsupported extension.
    """
    filepath = parsed_file_path(excel_filename)
    if filepath is None or not os.path.exists(filepath):
        return None
    with open(filepath, "r", encoding="utf-8") as fp:
        return json.load(fp=fp)


def save_parsed_state(excel_filename, obj):
    """Write ``obj`` as JSON to the state file of ``excel_filename``.

    Nothing is written when ``DEBUG_NO_SAVE_STATES`` is set.

    Raises:
        ValueError: if ``excel_filename`` has an unsupported extension.
    """
    filepath = parsed_file_path(excel_filename)
    if DEBUG_NO_SAVE_STATES:
        print("Saved! (fake because DEBUG_NO_SAVE_STATES)")
        return
    if filepath is None:
        raise ValueError(f"Cannot save state: unsupported file name {excel_filename!r}")
    with open(filepath, "w", encoding="utf-8") as fp:
        json.dump(obj, fp=fp, ensure_ascii=False, sort_keys=True)
    print(f"Saved parsed state to '{filepath}'")


def run_session():
    """Run one full pass: discover Excel links, re-parse changed files, publish.

    For every link the stored state is compared by ``last_changed``. Unchanged
    files only get their ``actual_at`` refreshed; new or changed files are
    downloaded, parsed, published and saved. ``parser.json`` is rewritten on
    every session, ``result_v2.json`` only when at least one file changed.

    Returns:
        A dict with ``changed_files``, ``total_files``, ``changed`` and
        ``new_files``.

    Raises:
        Exception: when fewer than 5 links were found outside debug mode, or
            when the download directory cannot be created.
    """
    faileds = []

    # Start from an empty download directory.
    try:
        try:
            shutil.rmtree(DIRNAME)
            print(f"Directory '{DIRNAME}' and its contents deleted successfully.")
        except Exception as e:
            print(f"Error deleting directory '{DIRNAME}': {e}")
        # exist_ok: a failed rmtree must not abort the whole session.
        os.makedirs(DIRNAME, exist_ok=True)
        print(f"Directory '{DIRNAME}' created successfully.")
    except Exception:
        print(f"Failed create '{DIRNAME}': ")
        raise

    print("main(); parse links starting...")
    EXCEL_LINKS = links_parser.parse_links(FACULTETS if DEBUG_ONE_FAC is None else [DEBUG_ONE_FAC])
    if len(EXCEL_LINKS) < 5 and not DEBUG_ONE_FAC:
        raise Exception("Safety exception: excel links count < 5; maybe in vstu.ru tech works")

    last_changeds = set()
    states = []
    changed = False
    changed_files = 0
    total_files = len(EXCEL_LINKS)
    new_files = 0

    for excel_dict in EXCEL_LINKS:
        try:
            last_changeds.add(excel_dict['last_changed'])
            excel_url = excel_dict['url']
            facultet = excel_dict['facultet']
            excel_filename = excel_url.split("/")[-1]

            state_path = parsed_file_path(excel_filename)
            if state_path is None:
                raise ValueError(f"Unsupported excel file name: {excel_filename!r}")
            excel_dict['json_represent'] = state_path.split(os.path.sep)[-1]
            print(f"Processing {facultet} {excel_filename}")

            state = load_parsed_state(excel_filename)
            is_new = state is None
            if is_new:
                state = {}
                # The routing key must be bound here: the log line below used
                # to read `r` before it was assigned in this branch.
                r = 'parser.excel_found.new'
                _publish(r, {
                    "type": "excel_file_found",
                    "same": False,
                    "is_new": True,
                    "excel_dict": excel_dict
                })
                print(f"RabbitMQ published r={r}")
                new_files += 1
            else:
                same_date = False
                try:
                    same_date = state['excel']['last_changed'] == excel_dict['last_changed']
                    print(f"Excel[{excel_filename}]: inServer={excel_dict['last_changed']}, inState={state['excel']['last_changed']} same={same_date}")
                except Exception:
                    # A malformed stored state is treated as "different".
                    print(f"Excel[{excel_filename}]: failed testify last_changed")

                r = "parser.excel_found." + ("same" if same_date else "different") + "." + facultet
                _publish(r, {
                    "type": "excel_file_found",
                    "same": same_date,
                    "is_new": False,
                    "excel_dict": excel_dict
                })
                print(f"RabbitMQ published r={r}")

                if same_date:
                    # Unchanged file: refresh the timestamp and drop the
                    # per-session marker, no download needed.
                    state['actual_at'] = currt()
                    state['excel'].pop('different_in_this_session', None)
                    states.append(state)
                    save_parsed_state(excel_filename, state)
                    continue

            changed_files += 1
            changed = True
            excel_dict['different_in_this_session'] = True
            state['actual_at'] = currt()
            state['excel'] = excel_dict

            # Case-insensitive, like the extension check in parsed_file_path().
            is_xlsx = excel_url.lower().endswith(".xlsx")
            download_place = f"{DIRNAME}/" + excel_filename + "_" + facultet + ".xls" + ("x" if is_xlsx else "")
            utils.download_file_from_url(excel_url, download_place)
            state['excel']['sha1hash'] = utils.calculate_sha1(download_place)
            state['sheets'] = parse_sheets(download_place)

            # Publish before saving: if delivery fails the state stays
            # unsaved, so the file is picked up again next session.
            _publish("parser.excel_parsed." + facultet, {
                "type": "excel_file_parsed",
                "is_new": is_new,
                "state": state
            })
            save_parsed_state(excel_filename, state)
            states.append(state)
        except Exception as e:
            # .get(): the handler itself must not raise if 'url' is missing.
            faileds.append({
                "uuid": str(uuid.uuid4()),
                "exception": str(e),
                "traceback": traceback.format_exception(e),
                "context": f"Failed process excel file {excel_dict.get('url')}"
            })
            traceback.print_exception(e)

    # Placeholder is kept when there are no dates or they cannot be ordered.
    lc = {"*_x": ":("}
    try:
        ordered = sorted(last_changeds)
        lc = {
            "early": ordered[0],
            "newly": ordered[-1]
        }
    except (TypeError, IndexError):
        pass

    with open("parser.json", 'w', encoding="utf-8") as fp:
        json.dump({
            "last_changeds": lc,
            "actual_at": currt(),
            "all_files": EXCEL_LINKS,
            "faileds": faileds
        }, fp=fp, ensure_ascii=False)

    if changed:
        d = {
            "version": 2,
            "notice": "ОТКАЗ ОТ ОТВЕТСТВЕННОСТИ: ПРЕДОСТАВЛЯЕТСЯ КАК-ЕСТЬ (AS-IS) БЕЗ КАКИХ ЛИБО ГАРАНТИЙ",
            "contact": "https://fazziclay.com/ или fazziclay@gmail.com",
            "api_notices": {
                "just_save_and_check_diffs": "просто сохраните и проверяйте разницу"
            },
            "actual_at": currt(),
            "all_files": sorted(states, key=lambda file_state: file_state['excel']['url']),
            "faileds": faileds
        }
        with open("result_v2.json", 'w', encoding="utf-8") as fp:
            json.dump(d, fp=fp, ensure_ascii=False)
        _publish("parser.result_v2", {
            "type": "schedule_result_v2",
            "data": d
        })

    # Remove the downloaded workbooks.
    try:
        shutil.rmtree(DIRNAME)
        print(f"Directory '{DIRNAME}' and its contents deleted successfully.")
    except Exception as e:
        print(f"Error deleting directory '{DIRNAME}': {e}")

    return {
        "changed_files": changed_files,
        'total_files': total_files,
        "changed": changed,
        "new_files": new_files
    }


def check_dirs():
    """Create ``PARSED_DIR`` if it does not exist yet."""
    if not os.path.exists(PARSED_DIR):
        os.mkdir(PARSED_DIR)


def main():
    """Run sessions forever, sleeping 14-21 minutes between them.

    Each session's outcome is announced with a ``parser.session_end.*``
    message. With ``DEBUG_ONE_FAC`` set, the loop stops after the first
    successful session.
    """
    while True:
        t = utils.StepTimeCounter()
        err = None
        sess = None
        try:
            check_dirs()
            print("BEGIN run_session();")
            sess = run_session()
            print("END run_session();")
            if DEBUG_ONE_FAC:
                print("DEBUG_ONE_FAC; break infinity-loop")
                break
        except Exception as e:
            err = e
            print("Exception in run_session();")
            traceback.print_exception(e)

        # A failed notification must not terminate the daemon loop.
        try:
            _publish("parser.session_end." + ('complete' if err is None else 'failed'), {
                "type": "session_end",
                "err": str(err) if err else None,
                "duration": t.step(),
                "session": sess
            })
        except Exception as e:
            print("Failed to publish session_end")
            traceback.print_exception(e)

        sleep_time = random.randint(14*60, 21*60)
        print(f"Sleep for {round(sleep_time/6)/10} minutes")
        time.sleep(sleep_time)
        print("Wake up!")


if __name__ == "__main__":
    print("Start")
    main()
    print("Bye!")