Files
VSTU_Schedule_Parser/main.py
2026-03-26 00:12:37 +03:00

373 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Copyright Stanislav Mironov
# General project rule: coordinates are always given as ROW first, then COL; numbering starts at zero
import json
import pika
import os
import random
import time
import traceback
import uuid
import parser
import translations
import utils
import json
import links_parser
import shutil
from dotenv import load_dotenv
# Load settings through python-dotenv before the environment is read below.
load_dotenv()

RABBITMQ_URL = os.getenv("RABBITMQ_URL")
EXCHANGE_NAME = os.getenv("RABBITMQ_EXCHANGE", "vstu_schedule")

# The broker connection is opened once, at import time, and shared by every
# publish call in this module through the module-level `channel`.
try:
    connection = pika.BlockingConnection(pika.URLParameters(RABBITMQ_URL))
    channel = connection.channel()
    channel.exchange_declare(
        exchange=EXCHANGE_NAME,
        exchange_type='topic',
        durable=True,
    )
except Exception as exc:
    # NOTE(review): the failure is only logged; `connection` / `channel` may
    # stay undefined afterwards, so later publish calls would fail.
    print("Failed to connect RabbitMQ")
    traceback.print_exception(exc)
def currt():
    """Return the current Unix timestamp rounded to the nearest whole second."""
    now = time.time()
    return round(now)
# Faculty codes passed to the links parser, kept in sorted order.
FACULTETS = sorted((
    "asp", "mag", "fastiv", "fat", "ftkm", "ftpp",
    "feu", "fevt", "htf", "vkf", "mmf", "fpik",
))

# Scratch directory for downloaded workbooks (wiped at session start and end).
DIRNAME = "excels"
# Directory holding the per-file parsed JSON state.
PARSED_DIR = "parsed"

# When set to a faculty code, only that faculty is processed and the main
# loop stops after a single session.
DEBUG_ONE_FAC = None  # 'fevt'
# Debug switch consulted by save_parsed_state().
DEBUG_NO_SAVE_STATES = False

# The same logging flag is mirrored into the parser module.
LOGGING = True
parser.LOGGING = LOGGING
def parse_sheets(download_place):
    """Parse every sheet of the workbook stored at *download_place*.

    Returns a dict with one ``"SHEET_<index>"`` entry per visited sheet. Each
    entry carries the sheet index, name, reader info, parse time, the parsed
    groups and, when present, extra parser output (raw leftovers, features,
    error, warnings). If anything raises, the exception is logged, an
    ``"error"`` entry is added and the data collected so far is returned.
    """
    result = {}
    try:
        reader = translations.create_reader(download_place)
        print("Reader info")
        print(reader.info())
        while True:
            timer = utils.StepTimeCounter()
            print(f"Parsing sheet №{reader.get_sheet_index()+1} (from 1)")
            sheet = {
                "index": reader.get_sheet_index(),
                "name": reader.get_sheet_name(),
                "reader_info": reader.info(),
                "groups": {}
            }
            # Registered before parsing so the sheet entry is still present
            # in the result when the parser raises.
            result[f"SHEET_{reader.get_sheet_index()!s}"] = sheet

            sheet_parser = parser.Parser(reader)
            print("Parser created; parser.parse();")
            sheet_parser.parse()
            print("parsed done!")
            sheet['parse_time'] = round(timer.step())

            # Optional fields are emitted only when they carry data.
            if len(sheet_parser.raw_no_schedule) > 0:
                sheet["other_raws"] = sheet_parser.raw_no_schedule
            if len(sheet_parser.features) > 0:
                sheet["features"] = sorted(sheet_parser.features)
            if sheet_parser.parser_error is not None:
                sheet["parser_error"] = sheet_parser.parser_error
            warnings_found = sheet_parser.parser_warnings
            if warnings_found is not None and len(warnings_found) > 0:
                sheet["parser_warnings"] = warnings_found

            for group_name in sheet_parser.groups.keys():
                group = sheet_parser.groups[group_name]
                # Week metadata is attached only when at least one group exists.
                sheet['week_keys_metadata'] = sheet_parser.week_keys_metadata
                sheet['groups'][group_name] = group
            print(f"Populates {len(sheet_parser.groups)} groups: " + " ".join(sheet_parser.groups.keys()))

            if reader.has_next_sheet():
                reader.next_sheet()
                print("Next sheet!")
            else:
                print("File ended")
                break
    except Exception as exc:
        print(exc)
        traceback.print_exc()
        # The anchor is stored in the result and printed, so the two can be matched.
        anchor = uuid.uuid4()
        result['error'] = {
            "smile": ":(",
            "error_message": str(exc),
            "log_anchor": str(anchor),
            "time": currt()
        }
        print(f"Log Anchor: {anchor}")
    return result
def parsed_file_path(excel_filename: str):
    """Map a schedule file name to the path of its parsed JSON state file.

    Args:
        excel_filename: File name ending in ``json``, ``xls`` or ``xlsx``
            (extension matched case-insensitively).

    Returns:
        ``PARSED_DIR + os.path.sep + <lower-cased name with .json extension>``,
        or ``None`` (after printing a message) when the extension is not one
        of the supported ones.
    """
    stem, dot, extension = excel_filename.rpartition(".")
    kind = extension.lower()
    if kind not in ("json", "xls", "xlsx"):
        print(f"Unknown filename format: {excel_filename}")
        return None
    if dot and kind != "json":
        # Swap only the trailing extension. A plain str.replace would also
        # rewrite identical text earlier in the name ("a.xls.old.xls").
        excel_filename = stem + ".json"
    return PARSED_DIR + os.path.sep + excel_filename.lower()
def load_parsed_state(excel_filename):
    """Load the previously saved parsed state for *excel_filename*.

    Returns:
        The decoded JSON object, or ``None`` when no usable state exists:
        the file name has an unsupported extension, the state file is
        missing, or its content cannot be decoded.
    """
    filepath = parsed_file_path(excel_filename)
    # parsed_file_path() yields None for unsupported extensions; passing that
    # to os.path.exists() would raise TypeError.
    if filepath is None or not os.path.exists(filepath):
        return None
    try:
        with open(filepath, "r", encoding="utf-8") as fp:
            return json.load(fp)
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        # A damaged state file is treated as "no state" so the caller can
        # rebuild it instead of failing on the same file every time.
        print(f"Ignoring unreadable parsed state '{filepath}': {exc}")
        return None
def save_parsed_state(excel_filename, obj):
    """Write *obj* as the parsed JSON state belonging to *excel_filename*.

    When ``DEBUG_NO_SAVE_STATES`` is truthy nothing is written; only a notice
    is printed. Otherwise the object is dumped as UTF-8 JSON with sorted keys.
    """
    filepath = parsed_file_path(excel_filename)
    if DEBUG_NO_SAVE_STATES:
        print("Saved! (fake because DEBUG_NO_SAVE_STATES)")
        # Without this return the "fake" save still overwrote the state file.
        return
    with open(filepath, "w", encoding="utf-8") as fp:
        json.dump(obj, fp, ensure_ascii=False, sort_keys=True)
    print(f"Saved parsed state to '{filepath}'")
def _publish_json(routing_key, payload):
    """Publish *payload* as a UTF-8 JSON message on the module exchange."""
    channel.basic_publish(
        exchange=EXCHANGE_NAME,
        routing_key=routing_key,
        properties=pika.BasicProperties(
            content_type="application/json",
            delivery_mode=2,  # AMQP "persistent" delivery mode
        ),
        body=json.dumps(payload, ensure_ascii=False).encode('utf-8'),
    )


def _remove_download_dir():
    """Best-effort removal of the scratch download directory."""
    try:
        shutil.rmtree(DIRNAME)
        print(f"Directory '{DIRNAME}' and its contents deleted successfully.")
    except Exception as e:
        print(f"Error deleting directory '{DIRNAME}': {e}")


def _recreate_download_dir():
    """Wipe the scratch download directory and create it again, empty."""
    _remove_download_dir()
    try:
        os.mkdir(DIRNAME)
        print(f"Directory '{DIRNAME}' created successfully.")
    except Exception:
        print(f"Failed create '{DIRNAME}': ")
        raise


def run_session():
    """Run one pass over all schedule files found by the links parser.

    For every link the stored state is loaded. Files whose ``last_changed``
    matches the stored one only get their ``actual_at`` refreshed; new or
    changed files are downloaded, hashed, parsed and saved again. Progress is
    announced on RabbitMQ, a summary is written to ``parser.json`` and, when
    at least one file changed, the full result is written to
    ``result_v2.json`` and published.

    Returns:
        ``{"changed": bool}``: whether any file was new or changed.

    Raises:
        Exception: when the scratch directory cannot be created, or when
            fewer than five links are found outside single-faculty debug
            mode (safety stop).
    """
    faileds = []
    _recreate_download_dir()

    print("main(); parse links starting...")
    excel_links = links_parser.parse_links(FACULTETS if DEBUG_ONE_FAC is None else [DEBUG_ONE_FAC])
    if len(excel_links) < 5 and not DEBUG_ONE_FAC:
        raise Exception("Safety exception: excel links count < 5; maybe in vstu.ru tech works")

    last_changeds = set()
    states = []
    changed = False
    for excel_dict in excel_links:
        try:
            last_changeds.add(excel_dict['last_changed'])
            excel_url = excel_dict['url']
            facultet = excel_dict['facultet']
            excel_filename = excel_url.split("/")[-1]
            excel_dict['json_represent'] = parsed_file_path(excel_filename).split(os.path.sep)[-1]
            print(f"Processing {facultet} {excel_filename}")

            state = load_parsed_state(excel_filename)
            is_new = state is None
            if is_new:
                state = {}
            else:
                same_date = False
                try:
                    same_date = state['excel']['last_changed'] == excel_dict['last_changed']
                    print(f"Excel[{excel_filename}]: inServer={excel_dict['last_changed']}, inState={state['excel']['last_changed']} same={same_date}")
                except Exception:
                    # A malformed stored state is treated as "different".
                    print(f"Excel[{excel_filename}]: failed testify last_changed")

                r = "parser.excel_found." + ("same" if same_date else "different") + "." + facultet
                _publish_json(r, {
                    "type": "excel_file_found",
                    "same": same_date,
                    "excel_dict": excel_dict
                })
                print(f"RabbitMQ published r={r}")

                if same_date:
                    # Unchanged file: refresh the timestamp and clear the
                    # per-session "different" marker left by an earlier run.
                    state['actual_at'] = currt()
                    state['excel'].pop('different_in_this_session', None)
                    states.append(state)
                    save_parsed_state(excel_filename, state)
                    continue

            # New or changed file: download and parse it from scratch.
            changed = True
            excel_dict['different_in_this_session'] = True
            state['actual_at'] = currt()
            state['excel'] = excel_dict
            is_xlsx = excel_url.endswith(".xlsx")
            download_place = f"{DIRNAME}/" + excel_filename + "_" + facultet + ".xls" + ("x" if is_xlsx else "")
            utils.download_file_from_url(excel_url, download_place)
            state['excel']['sha1hash'] = utils.calculate_sha1(download_place)
            state['sheets'] = parse_sheets(download_place)
            _publish_json("parser.excel_parsed." + facultet, {
                "type": "excel_file_parsed",
                "is_new": is_new,
                "state": state
            })
            save_parsed_state(excel_filename, state)
            states.append(state)
        except Exception as e:
            # The failure may itself be a missing 'url' key (or a non-dict
            # entry), so the context must not index the entry directly.
            if isinstance(excel_dict, dict):
                failed_url = excel_dict.get('url', '<unknown url>')
            else:
                failed_url = repr(excel_dict)
            faileds.append({
                "uuid": str(uuid.uuid4()),
                "exception": str(e),
                "traceback": traceback.format_exception(e),
                "context": f"Failed process excel file {failed_url}"
            })
            traceback.print_exception(e)

    with open("parser.json", 'w', encoding="utf-8") as fp:
        lc = {"*_x": ":("}
        try:
            s = sorted(last_changeds)
            lc = {
                "early": s[0],
                "newly": s[-1]
            }
        except (IndexError, TypeError):
            # No dates were collected, or they are not mutually comparable:
            # keep the placeholder value.
            pass
        json.dump({
            "last_changeds": lc,
            "actual_at": currt(),
            "all_files": excel_links,
            "faileds": faileds
        }, fp=fp, ensure_ascii=False)

    if changed:
        d = {
            "version": 2,
            "notice": "ОТКАЗ ОТ ОТВЕТСТВЕННОСТИ: ПРЕДОСТАВЛЯЕТСЯ КАК-ЕСТЬ (AS-IS) БЕЗ КАКИХ ЛИБО ГАРАНТИЙ",
            "contact": "https://fazziclay.com/ или fazziclay@gmail.com",
            "api_notices": {
                "just_save_and_check_diffs": "просто сохраните и проверяйте разницу"
            },
            "actual_at": currt(),
            "all_files": sorted(states, key=lambda st: st['excel']['url']),
            "faileds": faileds
        }
        with open("result_v2.json", 'w', encoding="utf-8") as fp:
            json.dump(d, fp=fp, ensure_ascii=False)
        _publish_json("parser.result_v2", {
            "type": "schedule_result_v2",
            "data": d
        })

    # Downloads are only needed during the session.
    _remove_download_dir()
    return {"changed": changed}
def check_dirs():
    """Ensure the parsed-state directory exists, creating parents as needed."""
    # exist_ok avoids the check-then-create race of exists() + mkdir(), and
    # makedirs also copes with a nested PARSED_DIR.
    os.makedirs(PARSED_DIR, exist_ok=True)
def main():
    """Run parsing sessions forever, one every 30 minutes.

    After each session a ``session_end`` message is published with the
    outcome, the duration and the session result. When ``DEBUG_ONE_FAC`` is
    set, the loop stops right after the first successful session (without
    publishing ``session_end``).
    """
    while True:
        t = utils.StepTimeCounter()
        err = None
        sess = None
        try:
            check_dirs()
            print("BEGIN run_session();")
            sess = run_session()
            print("END run_session();")
            if DEBUG_ONE_FAC:
                print("DEBUG_ONE_FAC; break infinity-loop")
                break
        except Exception as e:
            err = e
            print("Exception in run_session();")
            traceback.print_exception(e)

        # An exception object is not JSON-serializable: dumping it raw raised
        # TypeError here, outside the try block, and killed the loop on any
        # failed session. Send a text description instead.
        err_text = None if err is None else f"{type(err).__name__}: {err}"
        channel.basic_publish(
            exchange=EXCHANGE_NAME,
            routing_key="parser.session_end." + ('complete' if err is None else 'failed'),
            properties=pika.BasicProperties(
                content_type="application/json",
                delivery_mode=2
            ),
            body=json.dumps({
                "type": "session_end",
                "err": err_text,
                "duration": t.step(),
                "session": sess
            }, ensure_ascii=False).encode('utf-8')
        )
        print("Sleep for 30 minutes")
        time.sleep(60*30)
        print("Wake up!")
# Script entry point: run the session loop only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    print("Start")
    main()
    # Printed once main() returns.
    print("Bye!")