VSTU_Schedule_Parser/main.py

# Copyright Stanislav Mironov

# Общее правило проекта, сначала в координатах идёт ROW а потом COL, нумерация с нуля


import json
import pika
import os
import random
import time
import traceback
import uuid

from pika.exceptions import ChannelWrongStateError
import parser
import translations
import utils
import json
import links_parser
import shutil
from dotenv import load_dotenv
load_dotenv()

# Connection URL of the RabbitMQ broker; None when RABBITMQ_URL is not set.
RABBITMQ_URL = os.environ.get("RABBITMQ_URL")
# Exchange that every message of this module is published to.
EXCHANGE_NAME = os.environ.get("RABBITMQ_EXCHANGE", "vstu_schedule")
# True when INFINITY_LOOP is "yes"/"true" (case-insensitive): main() then keeps
# running sessions with a pause in between instead of exiting after one.
INFINITY_LOOP = os.environ.get("INFINITY_LOOP", "no").lower() in ['yes', "true"]
# The same boolean is stored on the imported parser module and in this module.
parser.LOGGING = LOGGING = os.environ.get("PARSER_LOGGING", "no").lower() in ['yes', "true"]


# Connect to the broker at import time and declare the durable topic exchange.
# NOTE(review): a failure here is only printed, so `connection` and `channel`
# stay undefined and the first publish later fails with NameError.
try:
    connection = pika.BlockingConnection(pika.URLParameters(RABBITMQ_URL))
    channel = connection.channel()

    channel.exchange_declare(exchange=EXCHANGE_NAME,
                            exchange_type='topic',
                            durable=True)
except Exception as e:
    print("Failed to connect RabbitMQ")
    traceback.print_exception(e)

def currt():
    """Return the current Unix time rounded to the nearest whole second."""
    now = time.time()
    return round(now)

# Faculty identifiers passed to links_parser.parse_links, kept in sorted order.
FACULTETS = sorted((
    "asp",
    "mag",
    "fastiv",
    "fat",
    "ftkm",
    "ftpp",
    "feu",
    "fevt",
    "htf",
    "vkf",
    "mmf",
    "fpik",
))
# Directory the workbooks are downloaded into; run_session() removes and recreates it.
DIRNAME = "excels"
# Directory holding one JSON state file per parsed workbook.
PARSED_DIR = "parsed"

# Debug switches.
DEBUG_ONE_FAC = None  # e.g. 'fevt' to limit a run to a single faculty
DEBUG_NO_SAVE_STATES = False  # checked by save_parsed_state()
DEBUG_NO_LINKS_DELAY = True  # forwarded to links_parser.parse_links()

# Parsed contents of facultets.json; assigned by main() before the first session.
facultets_data = None


def gen_groups_from_states(states):
    """Build the index of all groups found in the given per-file states.

    Each group of each sheet is stored under the key
    ``[<recognized faculty abbrev>/]<technical faculty abbrev>/<GROUP KEY>``
    with spaces and newlines removed and everything upper-cased.  The
    recognized-faculty prefix is added only when a faculty was recognized and
    it differs from the technical one.  When a key is met again, the first
    description is kept, flagged with ``doubled`` and extended with the excel
    descriptor of the later file.

    Returns ``{"version": 1, "groups": ...}`` where the groups went through a
    JSON round trip with sorted keys, or ``None`` (after printing a message)
    when the module-level ``facultets_data`` is not loaded.
    """
    if facultets_data is None:
        print("FAILED BECAUSE facultets_data is NONE!!!")
        return None

    index = {}
    for state in states:
        for sheet in state['sheets'].values():
            # An entry without groups (or with an empty mapping) simply
            # yields no iterations here.
            for raw_key, group in sheet.get('groups', {}).items():
                real_name = group['name']

                path = raw_key.upper()
                excel = state['excel']
                recognized = utils.get_preferer_facultet(
                    facultets_data, excel['url'], skip_for=['mag', 'asp'])
                technical = excel['facultet']
                path = utils.get_abbrev_for_facultet(facultets_data, technical) + "/" + path
                if recognized is not None and technical != recognized:
                    path = utils.get_abbrev_for_facultet(facultets_data, recognized) + "/" + path

                path = path.replace(" ", "").replace("\n", "").upper().strip()

                known = index.get(path)
                if known is not None:
                    # Same key seen before: remember the additional source file.
                    known['doubled'] = True
                    known['excels'].append(state['excel'])
                    continue

                index[path] = {
                    "full_path_key": path,
                    "real_name": real_name,
                    "facultet_tech": technical,
                    "facultet_regognized": recognized,
                    "excels": [state['excel']],
                    "excel_position": group['position_human'],
                    "excel_sheet": {
                        "name": sheet['name'],
                        "index": sheet['index']
                    },
                    "slots_weekdays_used": sorted(group['slots'])
                }

    # The dump/load round trip yields plain JSON types with sorted keys.
    normalized = json.loads(json.dumps(index, sort_keys=True, ensure_ascii=False))
    return {"version": 1, "groups": normalized}

def parse_sheets(download_place):
    """Parse every sheet of the workbook stored at *download_place*.

    Returns a dict with one ``SHEET_<index>`` entry per visited sheet.  Each
    entry holds the sheet index, name, reader info, parse time and parsed
    groups, plus optional ``other_raws``, ``features``, ``parser_error``,
    ``parser_warnings`` and ``week_keys_metadata`` keys.  Any exception stops
    the walk; it is printed and described under the ``error`` key, next to
    the sheets collected so far.
    """
    sheets = {}
    try:
        reader = translations.create_reader(download_place)
        print("Reader info")
        print(reader.info())

        has_more = True
        while has_more:
            timer = utils.StepTimeCounter()
            print(f"Parsing sheet №{reader.get_sheet_index()+1} (from 1)")
            entry = {
                "index": reader.get_sheet_index(),
                "name": reader.get_sheet_name(),
                "reader_info": reader.info(),
                "groups": {}
            }
            # Registered before parsing, so a failing sheet still leaves its header.
            sheets[f"SHEET_{reader.get_sheet_index()}"] = entry
            sheet_parser = parser.Parser(reader)

            print("Parser created; parser.parse();")
            sheet_parser.parse()

            print("parsed done!")
            entry['parse_time'] = round(timer.step())

            if len(sheet_parser.raw_no_schedule):
                entry["other_raws"] = sheet_parser.raw_no_schedule

            if len(sheet_parser.features):
                entry["features"] = sorted(sheet_parser.features)

            if sheet_parser.parser_error is not None:
                entry["parser_error"] = sheet_parser.parser_error

            warnings_found = sheet_parser.parser_warnings
            if warnings_found is not None and len(warnings_found):
                entry["parser_warnings"] = warnings_found

            for group_key, group in sheet_parser.groups.items():
                # The metadata is attached only when the sheet has a group.
                entry['week_keys_metadata'] = sheet_parser.week_keys_metadata
                entry['groups'][group_key] = group

            print(f"Populates {len(sheet_parser.groups)} groups: " + " ".join(sheet_parser.groups))

            has_more = reader.has_next_sheet()
            if has_more:
                reader.next_sheet()
                print("Next sheet!")
            else:
                print("File ended")

    except Exception as exc:
        print(exc)
        traceback.print_exc()
        anchor = uuid.uuid4()
        sheets['error'] = {
            "smile": ":(",
            "error_message": str(exc),
            "log_anchor": str(anchor),
            "time": currt()
        }
        print(f"Log Anchor: {anchor}")

    return sheets

def parsed_file_path(excel_filename: str):
    """Map a workbook (or state) file name to the path of its JSON state file.

    The text after the last dot is the extension and is matched
    case-insensitively against ``json``, ``xls`` and ``xlsx``.  For ``xls`` and
    ``xlsx`` only that trailing extension is replaced with ``.json``; an
    identical substring earlier in the name is left alone.  The resulting
    name is lower-cased and placed inside ``PARSED_DIR``.

    Args:
        excel_filename: Bare file name, e.g. the last segment of a URL.

    Returns:
        ``PARSED_DIR + os.path.sep + <lower-cased name>``, or ``None`` (after
        printing a message) when the extension is not a known one.
    """
    # With no dot in the name rpartition yields ("", "", name): the whole name
    # is then treated as the extension and, if known, kept unchanged.
    stem, dot, extension = excel_filename.rpartition(".")
    normalized_ext = extension.lower()

    if normalized_ext not in ("json", "xls", "xlsx"):
        print(f"Unknown filename format: {excel_filename}")
        return None

    if dot and normalized_ext != "json":
        excel_filename = stem + ".json"

    return PARSED_DIR + os.path.sep + excel_filename.lower()

def load_parsed_state(excel_filename):
    """Return the saved state for *excel_filename*, or None when no state file exists.

    The state file location comes from ``parsed_file_path``; the file is read
    as UTF-8 JSON.
    """
    state_path = parsed_file_path(excel_filename)
    if os.path.exists(state_path):
        with open(state_path, "r", encoding="utf-8") as fp:
            return json.load(fp=fp)
    return None

def save_parsed_state(excel_filename, obj):
    """Write *obj* as the JSON state of *excel_filename*.

    The target path comes from ``parsed_file_path``; the file is written as
    UTF-8 JSON with sorted keys.  When ``DEBUG_NO_SAVE_STATES`` is set nothing
    is written and only a notice is printed.

    Args:
        excel_filename: File name of the workbook the state belongs to.
        obj: JSON-serializable state.
    """
    filepath = parsed_file_path(excel_filename)
    if DEBUG_NO_SAVE_STATES:
        print("Saved! (fake because DEBUG_NO_SAVE_STATES)")
        # Return here: without it the "fake" save still overwrote the file.
        return

    with open(filepath, "w", encoding="utf-8") as fp:
        json.dump(obj, fp=fp, ensure_ascii=False, sort_keys=True)

    print(f"Saved parsed state to '{filepath}'")

def _publish_json(routing_key, payload):
    """Publish *payload* as UTF-8 JSON (delivery_mode=2) to EXCHANGE_NAME."""
    channel.basic_publish(
        exchange=EXCHANGE_NAME,
        routing_key=routing_key,
        properties=pika.BasicProperties(
            content_type="application/json",
            delivery_mode=2
        ),
        body=json.dumps(payload, ensure_ascii=False).encode('utf-8')
    )


def _remove_download_dir():
    """Delete DIRNAME with its contents; a failure is printed, not raised."""
    try:
        shutil.rmtree(DIRNAME)
        print(f"Directory '{DIRNAME}' and its contents deleted successfully.")
    except Exception as e:
        print(f"Error deleting directory '{DIRNAME}': {e}")


def _recreate_download_dir():
    """Drop a leftover DIRNAME (best effort) and create it anew; creation errors propagate."""
    _remove_download_dir()
    try:
        os.mkdir(DIRNAME)
        print(f"Directory '{DIRNAME}' created successfully.")
    except Exception:
        print(f"Failed create '{DIRNAME}': ")
        raise


def run_session():
    """Run one full pass: collect links, parse changed workbooks, write results.

    Steps:
      1. Recreate the download directory ``DIRNAME``.
      2. Fetch the workbook links via ``links_parser.parse_links``; fewer than
         five links (outside single-faculty debug mode) aborts the session.
      3. For each link: skip it if an already processed state has the same URL;
         load its saved state; publish an ``excel_found`` event; if the
         ``last_changed`` value matches the saved one only refresh
         ``actual_at``, otherwise download, hash and parse the workbook,
         publish ``excel_parsed`` and save the new state.
      4. Write ``parser.json`` and ``groups.json``; when anything changed also
         write ``result_v2.json`` and publish ``parser.result_v2``.
      5. Remove the download directory.

    A failure while handling a single link is recorded in ``faileds`` and does
    not stop the session, except ``ChannelWrongStateError``, which is re-raised.

    Returns:
        dict with ``changed_files``, ``total_files``, ``changed``,
        ``new_files``, ``faileds`` and ``faileds_count``.

    Raises:
        Exception: when fewer than five links were found (safety check).
        ChannelWrongStateError: when the channel rejects a publish.
    """
    faileds = []

    _recreate_download_dir()

    print("main(); parse links starting...")
    excel_links = links_parser.parse_links(
        FACULTETS if DEBUG_ONE_FAC is None else [DEBUG_ONE_FAC],
        DEBUG_NO_LINKS_DELAY=DEBUG_NO_LINKS_DELAY)

    if len(excel_links) < 5 and not DEBUG_ONE_FAC:
        raise Exception("Safety exception: excel links count < 5; maybe in vstu.ru tech works")

    last_changeds = set()
    states = []
    changed = False
    changed_files = 0
    total_files = len(excel_links)
    new_files = 0
    for excel_dict in excel_links:
        try:
            last_changeds.add(excel_dict['last_changed'])

            excel_url = excel_dict['url']

            # Skip the whole link when a state with this URL is already
            # collected; a `continue` placed inside a nested loop over
            # `states` would not leave this iteration.
            duplicate = next(
                (done['excel'] for done in states if done['excel']['url'] == excel_url),
                None)
            if duplicate is not None:
                print(f"Doubled excel files(By URLs)! Current 1th={excel_dict}; 2th={duplicate}")
                print("Skipped!")
                continue

            facultet = excel_dict['facultet']
            excel_filename = excel_url.split("/")[-1]
            state_path = parsed_file_path(excel_filename)
            if state_path is None:
                # Recorded in `faileds` by the handler below.
                raise ValueError(f"Unsupported excel file name: {excel_filename!r}")
            excel_dict['json_represent'] = state_path.split(os.path.sep)[-1]
            print(f"Processing {facultet} {excel_filename}")

            state = load_parsed_state(excel_filename)
            is_new = state is None
            if is_new:
                state = {}
                _publish_json('parser.excel_found.new', {
                    "type": "excel_file_found",
                    "same": False,
                    "is_new": True,
                    "excel_dict": excel_dict
                })
                print(f"RabbitMQ published 'parser.excel_found.new'")
                new_files += 1

            else:
                same_date = False
                try:
                    same_date = state['excel']['last_changed'] == excel_dict['last_changed']
                    print(f"Excel[{excel_filename}]: inServer={excel_dict['last_changed']}, inState={state['excel']['last_changed']} same={same_date}")

                except Exception:
                    # A malformed saved state is treated as "different".
                    print(f"Excel[{excel_filename}]: failed testify last_changed")

                r = "parser.excel_found." + ("same" if same_date else "different") + "." + facultet
                _publish_json(r, {
                    "type": "excel_file_found",
                    "same": same_date,
                    "is_new": False,
                    "excel_dict": excel_dict
                })
                print(f"RabbitMQ published r={r}")

                if same_date:
                    # Unchanged workbook: refresh the timestamp, clear the
                    # per-session marker and keep the stored parse result.
                    state['actual_at'] = currt()
                    state['excel'].pop('different_in_this_session', None)
                    states.append(state)
                    save_parsed_state(excel_filename, state)
                    continue

            changed_files += 1
            changed = True
            excel_dict['different_in_this_session'] = True
            excel_dict['recognized_facultet'] = utils.get_preferer_facultet(facultets_data, excel_url=excel_dict['url'])
            state['actual_at'] = currt()
            state['excel'] = excel_dict

            is_xlsx = excel_url.endswith(".xlsx")
            download_place = f"{DIRNAME}/" + excel_filename + "_" + facultet + ".xls" + ("x" if is_xlsx else "")
            utils.download_file_from_url(excel_url, download_place)
            sha1hash = utils.calculate_sha1(download_place)
            state['excel']['sha1hash'] = sha1hash

            state['sheets'] = parse_sheets(download_place)

            _publish_json("parser.excel_parsed." + facultet, {
                "type": "excel_file_parsed_not_same",
                "is_new": is_new,
                "state": state
            })

            save_parsed_state(excel_filename, state)
            states.append(state)

        except ChannelWrongStateError:
            # The channel is unusable; let the caller deal with it.
            raise
        except Exception as e:
            faileds.append({
                "uuid": str(uuid.uuid4()),
                "exception": str(e),
                "traceback": traceback.format_exception(e),
                "context": f"Failed process excel file {excel_dict.get('url')}"
            })
            traceback.print_exception(e)

    # Earliest/latest last_changed value; the placeholder is used when there
    # are no values (IndexError) or they cannot be ordered (TypeError).
    try:
        ordered = sorted(last_changeds)
        lc = {
            "early": ordered[0],
            "newly": ordered[-1]
        }
    except (IndexError, TypeError):
        lc = {"*_x": ":("}

    with open("parser.json", 'w', encoding="utf-8") as fp:
        json.dump({
                "last_changeds": lc,
                "actual_at": currt(),
                "all_files": excel_links,
                "faileds": faileds
            }, fp=fp, ensure_ascii=False)

    with open("groups.json", 'w', encoding="utf-8") as fp:
        json.dump(gen_groups_from_states(states), fp=fp, ensure_ascii=False)

    if changed:
        result = {
            "version": 2,
            "notice": "ОТКАЗ ОТ ОТВЕТСТВЕННОСТИ: ПРЕДОСТАВЛЯЕТСЯ КАК-ЕСТЬ (AS-IS) БЕЗ КАКИХ ЛИБО ГАРАНТИЙ",
            "contact": "https://fazziclay.com/ или fazziclay@gmail.com",
            "api_notices": {
                "just_save_and_check_diffs": "просто сохраните и проверяйте разницу"
            },
            "actual_at": currt(),
            "all_files": sorted(states, key=lambda item: item['excel']['url']),
            "faileds": faileds
        }

        with open("result_v2.json", 'w', encoding="utf-8") as fp:
            json.dump(result, fp=fp, ensure_ascii=False)

        _publish_json("parser.result_v2", {
            "type": "schedule_result_v2_changed",
        })

    # The downloaded workbooks are no longer needed.
    _remove_download_dir()

    return {"changed_files": changed_files, 'total_files': total_files, "changed": changed, "new_files": new_files, "faileds": faileds, "faileds_count": len(faileds)}

def check_dirs():
    """Create the PARSED_DIR directory unless something already exists at that path."""
    if os.path.exists(PARSED_DIR):
        return
    os.mkdir(PARSED_DIR)

def main():
    """Load the faculty data and run parsing sessions.

    ``facultets.json`` is read (as UTF-8) into the module-level
    ``facultets_data``.  Then ``run_session`` is executed once, or repeatedly
    with a random 14-21 minute pause when ``INFINITY_LOOP`` is set.  After each
    session a ``parser.session_end.complete`` / ``parser.session_end.failed``
    message with the duration, the error text and the session summary is
    published.  With ``DEBUG_ONE_FAC`` set, the loop ends right after the first
    successful session, before that message is published.
    """
    global facultets_data
    # Explicit encoding, like every other file access in this module, so the
    # result does not depend on the platform default.
    with open("facultets.json", encoding="utf-8") as fp:
        facultets_data = json.load(fp=fp)

    flag = True
    while flag:
        if not INFINITY_LOOP:
            flag = False

        t = utils.StepTimeCounter()
        err = None
        sess = None
        try:
            check_dirs()

            print("BEGIN run_session();")
            sess = run_session()
            print("END run_session();")

            if DEBUG_ONE_FAC:
                print("DEBUG_ONE_FAC; break infinity-loop")
                break

        except Exception as e:
            err = e
            print("Exception in run_session();")
            traceback.print_exception(e)

        channel.basic_publish(
            exchange=EXCHANGE_NAME,
            routing_key="parser.session_end." + ('complete' if err is None else 'failed'),
            properties=pika.BasicProperties(
                content_type="application/json",
                delivery_mode=2
            ),
            body=json.dumps({
                "type": "session_end",
                "err": str(err) if err else None,
                "duration": t.step(),
                "session": sess
                }, ensure_ascii=False).encode('utf-8')
        )

        if flag:
            sleep_time = random.randint(14*60, 21*60)
            print(f"Sleep for {round(sleep_time/6)/10} minutes")
            # Sleep through the connection so pika keeps processing broker
            # frames (heartbeats) during the pause; time.sleep() would block
            # them for the whole interval.
            connection.sleep(sleep_time)
            print("Wake up!")


# Entry point: run main() only when this file is executed directly.
if __name__ == "__main__":
    print("Start")
    main()
    print("Bye!")