# VSTU_Schedule_Parser/main.py

# Copyright Stanislav Mironov

# Project-wide rule: coordinates are written ROW first, then COL; numbering is zero-based


import json
import pika
import os
import random
import time
import traceback
import uuid
import parser
import translations
import utils
import json
import links_parser
import shutil
from dotenv import load_dotenv
load_dotenv()

# RabbitMQ settings are read from the environment (load_dotenv() has already run above).
# RABBITMQ_URL has no default, so it is None when the variable is not set.
RABBITMQ_URL = os.environ.get("RABBITMQ_URL")
EXCHANGE_NAME = os.environ.get("RABBITMQ_EXCHANGE", "vstu_schedule")

# Open a blocking connection at import time and declare the durable topic
# exchange that every basic_publish call in this module targets.
try:
    connection = pika.BlockingConnection(pika.URLParameters(RABBITMQ_URL))
    channel = connection.channel()

    channel.exchange_declare(exchange=EXCHANGE_NAME,
                            exchange_type='topic',
                            durable=True)
except Exception as e:
    # NOTE(review): the failure is only logged. `connection` / `channel` may be
    # left undefined here, in which case later `channel.basic_publish(...)`
    # calls would raise NameError - confirm this is the intended behaviour.
    print("Failed to connect RabbitMQ")
    traceback.print_exception(e)

def currt():
    """Return the current Unix timestamp rounded to the nearest whole second."""
    now = time.time()
    return round(now)

# Faculty identifiers handed to links_parser.parse_links(); stored sorted.
FACULTETS = sorted([
    "asp", "mag", "fastiv", "fat", "ftkm", "ftpp", "feu", "fevt", "htf", "vkf", "mmf", "fpik"
])
# Scratch directory for downloaded Excel files; run_session() deletes and recreates it.
DIRNAME = "excels"
# Directory holding one parsed-state JSON file per Excel file.
PARSED_DIR = "parsed"

# When set to a faculty identifier, only that faculty is processed and main()
# leaves its loop after the first session that finishes without an exception.
DEBUG_ONE_FAC = None #'fevt'
# Debug switch consulted by save_parsed_state().
DEBUG_NO_SAVE_STATES = False

# Sets the LOGGING flag on the imported parser module and on this module at once.
parser.LOGGING = LOGGING = True

def parse_sheets(download_place):
    """Parse every sheet of the workbook stored at *download_place*.

    Returns a dict with one ``"SHEET_<index>"`` entry per visited sheet. Each
    entry always carries ``index``, ``name``, ``reader_info`` and ``groups``,
    plus ``parse_time`` and the optional parser outputs once parsing of that
    sheet has finished.

    Exceptions are not propagated: the sheets collected so far are kept and
    an ``"error"`` entry (message, log anchor UUID, timestamp) is added.
    """
    result = {}
    try:
        reader = translations.create_reader(download_place)
        print("Reader info")
        print(reader.info())

        while True:
            timer = utils.StepTimeCounter()
            print(f"Parsing sheet №{reader.get_sheet_index()+1} (from 1)")
            sheet = {
                "index": reader.get_sheet_index(),
                "name": reader.get_sheet_name(),
                "reader_info": reader.info(),
                "groups": {}
            }
            # Registered before parsing so that a sheet whose parse raises
            # still shows up (partially filled) in the result.
            result["SHEET_" + str(reader.get_sheet_index())] = sheet
            sheet_parser = parser.Parser(reader)

            print("Parser created; parser.parse();")
            sheet_parser.parse()

            print("parsed done!")
            sheet['parse_time'] = round(timer.step())

            # Optional outputs are attached only when the parser produced them.
            if len(sheet_parser.raw_no_schedule) > 0:
                sheet["other_raws"] = sheet_parser.raw_no_schedule

            if len(sheet_parser.features) > 0:
                sheet["features"] = sorted(sheet_parser.features)

            if sheet_parser.parser_error is not None:
                sheet["parser_error"] = sheet_parser.parser_error

            warnings_found = sheet_parser.parser_warnings
            if warnings_found is not None and len(warnings_found) > 0:
                sheet["parser_warnings"] = warnings_found

            groups = sheet_parser.groups
            # Week metadata is stored only for sheets that yielded a group.
            if len(groups) > 0:
                sheet['week_keys_metadata'] = sheet_parser.week_keys_metadata
            for group_name in groups.keys():
                sheet['groups'][group_name] = groups[group_name]

            print(f"Populates {len(groups)} groups: " + " ".join(groups.keys()))

            if reader.has_next_sheet():
                reader.next_sheet()
                print("Next sheet!")
                continue

            print("File ended")
            break

    except Exception as exc:
        print(exc)
        traceback.print_exc()
        anchor = uuid.uuid4()
        result['error'] = {
            "smile": ":(",
            "error_message": str(exc),
            "log_anchor": str(anchor),
            "time": currt()
        }
        # The anchor is printed so the log entry can be matched to the stored error.
        print(f"Log Anchor: {anchor}")

    return result

def parsed_file_path(excel_filename: str):
    """Map a schedule file name to the path of its parsed-state JSON file.

    The extension (the text after the last dot) is matched case-insensitively
    against ``json``, ``xls`` and ``xlsx``. An Excel extension is swapped for
    ``.json``, the whole name is lower-cased and the result is placed under
    ``PARSED_DIR``.

    Args:
        excel_filename: bare file name, e.g. the last segment of a URL.

    Returns:
        The path as a string, or None (after printing a message) when the
        extension is not one of the recognised formats.
    """
    stem, dot, extension = excel_filename.rpartition(".")

    if extension.lower() not in ("json", "xls", "xlsx"):
        print(f"Unknown filename format: {excel_filename}")
        return None

    # Swap only the trailing extension: a plain str.replace() would also
    # rewrite an identical ".xls" that occurs earlier in the name
    # (e.g. "a.xls.b.xls"). A name without any dot is left untouched.
    if dot and extension.lower() != "json":
        excel_filename = stem + ".json"

    return PARSED_DIR + os.path.sep + excel_filename.lower()

def load_parsed_state(excel_filename):
    """Return the saved parsed state for *excel_filename*.

    The state is read as UTF-8 JSON from the path given by
    parsed_file_path(). Returns None when nothing exists at that path.
    """
    state_path = parsed_file_path(excel_filename)
    if os.path.exists(state_path):
        with open(state_path, "r", encoding="utf-8") as state_file:
            return json.load(state_file)
    return None

def save_parsed_state(excel_filename, obj):
    """Write *obj* as UTF-8 JSON to the parsed-state file of *excel_filename*.

    The target path comes from parsed_file_path(). Keys are written sorted
    and non-ASCII text is kept as-is. When DEBUG_NO_SAVE_STATES is set,
    nothing is written and only a notice is printed.
    """
    filepath = parsed_file_path(excel_filename)
    if DEBUG_NO_SAVE_STATES:
        print("Saved! (fake because DEBUG_NO_SAVE_STATES)")
        # The debug switch must really skip the write; without this return
        # the file was written despite the "fake" notice.
        return

    with open(filepath, "w", encoding="utf-8") as fp:
        json.dump(obj, fp=fp, ensure_ascii=False, sort_keys=True)

    print(f"Saved parsed state to '{filepath}'")

def _publish_json(routing_key, payload):
    """Publish *payload* as UTF-8 JSON to EXCHANGE_NAME with delivery_mode=2."""
    channel.basic_publish(
        exchange=EXCHANGE_NAME,
        routing_key=routing_key,
        properties=pika.BasicProperties(
            content_type="application/json",
            delivery_mode=2
        ),
        body=json.dumps(payload, ensure_ascii=False).encode('utf-8')
    )


def run_session():
    """Run one full download-and-parse pass over the schedule Excel files.

    Steps:
      1. Recreate the scratch directory DIRNAME.
      2. Collect the Excel links via links_parser.parse_links().
      3. For every link: if a saved state with the same ``last_changed``
         exists, only refresh its ``actual_at``; otherwise download the file,
         parse it, publish the result and save the new state.
      4. Always rewrite ``parser.json``; rewrite and publish
         ``result_v2.json`` only when at least one file was new or changed.
      5. Remove the scratch directory.

    Errors raised while handling a single file are collected in the
    ``faileds`` list (written to both JSON files) and do not abort the
    session.

    Returns:
        dict: ``{"changed": bool}`` - whether any file was new or different.

    Raises:
        Exception: if DIRNAME cannot be created, or if fewer than five links
            were found while DEBUG_ONE_FAC is not set.
    """
    faileds = []

    # Start from an empty scratch directory. A failed delete (for example the
    # directory does not exist yet) is only reported; a failed create is fatal.
    try:
        try:
            shutil.rmtree(DIRNAME)
            print(f"Directory '{DIRNAME}' and its contents deleted successfully.")
        except Exception as e:
            print(f"Error deleting directory '{DIRNAME}': {e}")
        os.mkdir(DIRNAME)
        print(f"Directory '{DIRNAME}' created successfully.")
    except Exception:
        print(f"Failed create '{DIRNAME}': ")
        raise

    print("main(); parse links starting...")
    EXCEL_LINKS = links_parser.parse_links(FACULTETS if DEBUG_ONE_FAC is None else [DEBUG_ONE_FAC])

    if len(EXCEL_LINKS) < 5 and not DEBUG_ONE_FAC:
        raise Exception("Safety exception: excel links count < 5; maybe in vstu.ru tech works")

    last_changeds = set()
    states = []
    changed = False
    for excel_dict in EXCEL_LINKS:
        try:
            last_changeds.add(excel_dict['last_changed'])

            excel_url = excel_dict['url']
            facultet = excel_dict['facultet']
            excel_filename = excel_url.split("/")[-1]
            excel_dict['json_represent'] = parsed_file_path(excel_filename).split(os.path.sep)[-1]
            print(f"Processing {facultet} {excel_filename}")

            state = load_parsed_state(excel_filename)
            is_new = state is None
            if is_new:
                state = {}

            else:
                same_date = False
                try:
                    same_date = state['excel']['last_changed'] == excel_dict['last_changed']
                    print(f"Excel[{excel_filename}]: inServer={excel_dict['last_changed']}, inState={state['excel']['last_changed']} same={same_date}")

                except (KeyError, TypeError):
                    # The stored state lacks the expected structure: treat the
                    # file as different so that it is parsed again.
                    print(f"Excel[{excel_filename}]: failed testify last_changed")

                r = "parser.excel_found." + ("same" if same_date else "different") + "." + facultet
                _publish_json(r, {
                    "type": "excel_file_found",
                    "same": same_date,
                    "excel_dict": excel_dict
                })
                print(f"RabbitMQ published r={r}")

                if same_date:
                    # Unchanged on the server: refresh the timestamp, clear the
                    # per-session marker and keep the stored parse result.
                    state['actual_at'] = currt()
                    state['excel'].pop('different_in_this_session', None)
                    states.append(state)
                    save_parsed_state(excel_filename, state)
                    continue

            changed = True
            excel_dict['different_in_this_session'] = True
            state['actual_at'] = currt()
            state['excel'] = excel_dict

            # Case-insensitive, consistent with parsed_file_path(), so that an
            # upper-case ".XLSX" URL is not stored with an ".xls" extension.
            is_xlsx = excel_url.lower().endswith(".xlsx")
            download_place = f"{DIRNAME}/" + excel_filename + "_" + facultet + ".xls" + ("x" if is_xlsx else "")
            utils.download_file_from_url(excel_url, download_place)
            sha1hash = utils.calculate_sha1(download_place)
            state['excel']['sha1hash'] = sha1hash

            state['sheets'] = parse_sheets(download_place)

            _publish_json("parser.excel_parsed." + facultet, {
                "type": "excel_file_parsed",
                "is_new": is_new,
                "state": state
            })

            save_parsed_state(excel_filename, state)
            states.append(state)

        except Exception as e:
            # One broken file must not abort the session; record it instead.
            faileds.append({
                "uuid": str(uuid.uuid4()),
                "exception": str(e),
                "traceback": traceback.format_exception(e),
                "context": f"Failed process excel file {excel_dict['url']}"
            })
            traceback.print_exception(e)

    with open("parser.json", 'w', encoding="utf-8") as fp:
        # Placeholder kept when the range cannot be computed: IndexError when
        # no link contributed a value, TypeError when the values do not sort.
        lc = {"*_x": ":("}
        try:
            ordered = sorted(last_changeds)
            lc = {
                "early": ordered[0],
                "newly": ordered[-1]
            }
        except (IndexError, TypeError):
            pass

        json.dump({
            "last_changeds": lc,
            "actual_at": currt(),
            "all_files": EXCEL_LINKS,
            "faileds": faileds
        }, fp=fp, ensure_ascii=False)

    if changed:
        d = {
            "version": 2,
            "notice": "ОТКАЗ ОТ ОТВЕТСТВЕННОСТИ: ПРЕДОСТАВЛЯЕТСЯ КАК-ЕСТЬ (AS-IS) БЕЗ КАКИХ ЛИБО ГАРАНТИЙ",
            "contact": "https://fazziclay.com/ или fazziclay@gmail.com",
            "api_notices": {
                "just_save_and_check_diffs": "просто сохраните и проверяйте разницу"
            },
            "actual_at": currt(),
            "all_files": sorted(states, key=lambda st: st['excel']['url']),
            "faileds": faileds
        }
        with open("result_v2.json", 'w', encoding="utf-8") as fp:
            json.dump(d, fp=fp, ensure_ascii=False)

        _publish_json("parser.result_v2", {
            "type": "schedule_result_v2",
            "data": d
        })

    # Remove the scratch directory together with the downloaded files.
    try:
        shutil.rmtree(DIRNAME)
        print(f"Directory '{DIRNAME}' and its contents deleted successfully.")
    except Exception as e:
        print(f"Error deleting directory '{DIRNAME}': {e}")

    return {"changed": changed}

def check_dirs():
    """Create the PARSED_DIR directory unless something already exists at that path."""
    if os.path.exists(PARSED_DIR):
        return
    os.mkdir(PARSED_DIR)

def main():
    """Run parsing sessions in an endless loop, one every 30 minutes.

    Each iteration makes sure the output directory exists, runs
    run_session(), publishes a ``session_end`` message (routing key suffix
    ``complete`` or ``failed``) and then sleeps. An exception raised by the
    session is printed and reported in that message instead of ending the
    loop.

    When DEBUG_ONE_FAC is set, the loop ends right after the first session
    that finishes without an exception, before ``session_end`` is published.
    """
    while True:
        timer = utils.StepTimeCounter()
        err = None
        sess = None
        try:
            check_dirs()

            print("BEGIN run_session();")
            sess = run_session()
            print("END run_session();")

            if DEBUG_ONE_FAC:
                print("DEBUG_ONE_FAC; break infinity-loop")
                break

        except Exception as e:
            err = e
            print("Exception in run_session();")
            traceback.print_exception(e)

        channel.basic_publish(
            exchange=EXCHANGE_NAME,
            routing_key="parser.session_end." + ('complete' if err is None else 'failed'),
            properties=pika.BasicProperties(
                content_type="application/json",
                delivery_mode=2
            ),
            body=json.dumps({
                "type": "session_end",
                # An exception object is not JSON serializable; passing it
                # through would make json.dumps raise TypeError and end the
                # loop, so only its message text is sent.
                "err": None if err is None else str(err),
                "duration": timer.step(),
                "session": sess
                }, ensure_ascii=False).encode('utf-8')
        )

        # NOTE(review): time.sleep() does not service the pika connection while
        # waiting; BlockingConnection.sleep() may be needed to keep heartbeats
        # alive over 30 minutes - confirm against the broker settings.
        print("Sleep for 30 minutes")
        time.sleep(60*30)
        print("Wake up!")


# Script entry point: the polling loop starts only when the file is executed directly.
if __name__ == "__main__":
    print("Start")
    main()
    # Reached only when main() returns, i.e. after its DEBUG_ONE_FAC break.
    print("Bye!")