Files
VSTU_Schedule_Parser/main.py
FazziCLAY 1700b6db14
All checks were successful
Build and Run VSTU Schedule Parser / build_and_run (push) Successful in 18s
enable delays
2026-04-05 13:28:16 +03:00

471 lines
16 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Copyright Stanislav Mironov
# Project-wide rule: coordinates are always given as ROW first, then COL; numbering starts at zero.
import json
import pika
import os
import random
import time
import traceback
import uuid
from pika.exceptions import ChannelWrongStateError
import parser
import translations
import utils
import json
import links_parser
import shutil
from dotenv import load_dotenv
# Populate os.environ from a local .env file before any setting is read.
load_dotenv()

# Runtime configuration, all taken from environment variables.
RABBITMQ_URL = os.getenv("RABBITMQ_URL")
EXCHANGE_NAME = os.getenv("RABBITMQ_EXCHANGE", "vstu_schedule")
# Boolean switches are enabled by the (case-insensitive) values "yes" or "true".
INFINITY_LOOP = os.getenv("INFINITY_LOOP", "no").lower() in ("yes", "true")
LOGGING = os.getenv("PARSER_LOGGING", "no").lower() in ("yes", "true")
parser.LOGGING = LOGGING

# Open the RabbitMQ connection and declare the durable topic exchange used
# for every message this script publishes.
# NOTE: a failure here is only printed; `connection` / `channel` then stay
# undefined, so later publish calls will fail with NameError.
try:
    connection = pika.BlockingConnection(
        pika.URLParameters(RABBITMQ_URL)
    )
    channel = connection.channel()
    channel.exchange_declare(
        exchange=EXCHANGE_NAME,
        exchange_type='topic',
        durable=True,
    )
except Exception as e:
    print("Failed to connect RabbitMQ")
    traceback.print_exception(e)
def currt():
    """Return the current Unix time, rounded to the nearest whole second."""
    now = time.time()
    return round(now)
# Faculty codes handed to links_parser.parse_links(); kept in alphabetical order.
FACULTETS = sorted((
    "asp",
    "mag",
    "fastiv",
    "fat",
    "ftkm",
    "ftpp",
    "feu",
    "fevt",
    "htf",
    "vkf",
    "mmf",
    "fpik",
))

# Directory that holds downloaded excel files during a session.
DIRNAME = "excels"
# Directory that holds the per-file parsed JSON states.
PARSED_DIR = "parsed"

# Debug switches.
# When set to a faculty code (for example 'fevt'), run_session() asks the
# links parser for that single faculty only.
DEBUG_ONE_FAC = None
# Checked by save_parsed_state().
DEBUG_NO_SAVE_STATES = False
# Forwarded to links_parser.parse_links().
DEBUG_NO_LINKS_DELAY = False

# Filled by main() with the content of facultets.json.
facultets_data = None
def gen_groups_from_states(states):
    """Build the group catalogue from a list of parsed excel states.

    Every group found in every sheet of every state is registered under a
    normalised key of the form ``<faculty abbrev>/<GROUP KEY>``. When the
    faculty recognised from the excel URL differs from the technical one,
    its abbreviation is prepended as an extra leading segment. A key seen
    more than once is flagged with ``doubled`` and collects every excel
    that contained it.

    Returns ``{"version": 1, "groups": {...}}`` with the groups sorted by
    key, or ``None`` (after printing a message) when the module-level
    ``facultets_data`` has not been loaded.
    """
    if facultets_data is None:
        print("FAILED BECAUSE facultets_data is NONE!!!")
        return None

    catalog = {}
    for state in states:
        for sheet in state['sheets'].values():
            # Entries without a 'groups' mapping are treated as having none.
            sheet_groups = sheet.get('groups', {})
            if len(sheet_groups.keys()) == 0:
                continue

            for raw_key, group in sheet_groups.items():
                display_name = group['name']
                upper_key = raw_key.upper()
                recognized = utils.get_preferer_facultet(facultets_data, state['excel']['url'], skip_for=['mag', 'asp'])
                technical = state['excel']['facultet']

                path = utils.get_abbrev_for_facultet(facultets_data, technical) + "/" + upper_key
                if recognized is not None and technical != recognized:
                    path = utils.get_abbrev_for_facultet(facultets_data, recognized) + "/" + path
                # Drop spaces and line breaks, then force upper case.
                path = path.replace(" ", "").replace("\n", "").upper().strip()

                known = catalog.get(path)
                if known is None:
                    catalog[path] = {
                        "full_path_key": path,
                        "real_name": display_name,
                        "facultet_tech": technical,
                        "facultet_regognized": recognized,
                        "excels": [state['excel']],
                        "excel_position": group['position_human'],
                        "excel_sheet": {
                            "name": sheet['name'],
                            "index": sheet['index'],
                        },
                        "slots_weekdays_used": sorted(group['slots'].keys()),
                    }
                else:
                    known['doubled'] = True
                    known['excels'].append(state['excel'])

    # The JSON round trip yields a detached copy whose keys are sorted.
    ordered = json.loads(json.dumps(catalog, sort_keys=True, ensure_ascii=False))
    return {"version": 1, "groups": ordered}
def parse_sheets(download_place):
    """Parse every sheet of the excel file stored at ``download_place``.

    Returns a dict with one ``"SHEET_<index>"`` entry per processed sheet.
    Each entry carries the sheet index, name, reader info, parse time, the
    parsed groups and, when present, extra parser output (raw rows,
    features, error, warnings). Any exception stops the walk and is
    recorded under the ``"error"`` key next to the sheets collected so far.
    """
    sheets = {}
    try:
        reader = translations.create_reader(download_place)
        print("Reader info")
        print(reader.info())

        while True:
            timer = utils.StepTimeCounter()
            print(f"Parsing sheet №{reader.get_sheet_index()+1} (from 1)")
            entry = {
                "index": reader.get_sheet_index(),
                "name": reader.get_sheet_name(),
                "reader_info": reader.info(),
                "groups": {},
            }
            # Registered before parsing, so a failing sheet still appears in the result.
            sheets["SHEET_" + str(reader.get_sheet_index())] = entry

            sheet_parser = parser.Parser(reader)
            print("Parser created; parser.parse();")
            sheet_parser.parse()
            print("parsed done!")
            entry['parse_time'] = round(timer.step())

            # Optional parser output is stored only when there is something to store.
            if len(sheet_parser.raw_no_schedule) > 0:
                entry["other_raws"] = sheet_parser.raw_no_schedule
            if len(sheet_parser.features) > 0:
                entry["features"] = sorted(sheet_parser.features)
            if sheet_parser.parser_error is not None:
                entry["parser_error"] = sheet_parser.parser_error
            if sheet_parser.parser_warnings is not None and len(sheet_parser.parser_warnings) > 0:
                entry["parser_warnings"] = sheet_parser.parser_warnings

            for group_key in sheet_parser.groups.keys():
                # The week metadata is attached only when the sheet has at least one group.
                entry['week_keys_metadata'] = sheet_parser.week_keys_metadata
                entry['groups'][group_key] = sheet_parser.groups[group_key]
            print(f"Populates {len(sheet_parser.groups)} groups: " + " ".join(sheet_parser.groups.keys()))

            if reader.has_next_sheet():
                reader.next_sheet()
                print("Next sheet!")
            else:
                print("File ended")
                break
    except Exception as exc:
        print(exc)
        traceback.print_exc()
        # The anchor is printed as well, so the stored error can be matched with the log.
        anchor = uuid.uuid4()
        sheets['error'] = {
            "smile": ":(",
            "error_message": str(exc),
            "log_anchor": str(anchor),
            "time": currt(),
        }
        print(f"Log Anchor: {anchor}")
    return sheets
def parsed_file_path(excel_filename: str, parsed_dir=None):
    """Return the path of the JSON state file that belongs to an excel file.

    The extension of ``excel_filename`` (``xls``, ``xlsx`` or ``json``, in
    any letter case) is replaced by ``.json`` and the whole name is
    lower-cased. Only the trailing extension is rewritten, so a name such
    as ``a.xls.old.xls`` becomes ``a.xls.old.json``.

    Args:
        excel_filename: File name (not a path) of the excel or JSON file.
        parsed_dir: Directory to prefix the name with; defaults to the
            module-level ``PARSED_DIR``.

    Returns:
        ``<parsed_dir><os.path.sep><name>.json``, or ``None`` (after
        printing a message) when the extension is not recognised.
    """
    stem, dot, extension = excel_filename.rpartition(".")
    kind = extension.lower()
    if kind not in ("json", "xls", "xlsx"):
        print(f"Unknown filename format: {excel_filename}")
        return None

    # A name without any dot is left as it is, which matches the previous behaviour.
    if dot and kind != "json":
        excel_filename = stem + ".json"

    if parsed_dir is None:
        parsed_dir = PARSED_DIR
    return parsed_dir + os.path.sep + excel_filename.lower()
def load_parsed_state(excel_filename):
    """Load the previously saved state of an excel file.

    Returns the decoded JSON content of the state file, or ``None`` when
    no state file exists for ``excel_filename``.
    """
    state_path = parsed_file_path(excel_filename)
    if os.path.exists(state_path):
        with open(state_path, "r", encoding="utf-8") as handle:
            return json.load(fp=handle)
    return None
def save_parsed_state(excel_filename, obj):
    """Write ``obj`` as the JSON state of ``excel_filename``.

    The state is stored as UTF-8 JSON with sorted keys at the path given
    by ``parsed_file_path``. When ``DEBUG_NO_SAVE_STATES`` is set nothing
    is written; only a notice is printed.
    """
    filepath = parsed_file_path(excel_filename)
    if DEBUG_NO_SAVE_STATES:
        print("Saved! (fake because DEBUG_NO_SAVE_STATES)")
        # Without this return the "fake" save still wrote the file.
        return
    with open(filepath, "w", encoding="utf-8") as fp:
        json.dump(obj, fp=fp, ensure_ascii=False, sort_keys=True)
    print(f"Saved parsed state to '{filepath}'")
def _publish_json(routing_key, payload):
    """Publish ``payload`` as a UTF-8 JSON message to EXCHANGE_NAME."""
    channel.basic_publish(
        exchange=EXCHANGE_NAME,
        routing_key=routing_key,
        properties=pika.BasicProperties(
            content_type="application/json",
            delivery_mode=2
        ),
        body=json.dumps(payload, ensure_ascii=False).encode('utf-8')
    )


def run_session():
    """Run one full parsing session.

    Steps:
      1. Recreate the download directory ``DIRNAME``.
      2. Collect excel links through ``links_parser.parse_links``.
      3. For every link, compare ``last_changed`` with the saved state.
         Unchanged files only get their ``actual_at`` refreshed; new or
         changed files are downloaded, parsed and saved. Each step is
         announced on RabbitMQ.
      4. Write ``parser.json`` and ``groups.json``; when anything changed
         also write ``result_v2.json`` and publish ``parser.result_v2``.
      5. Remove the download directory.

    A failure while handling a single file is recorded in ``faileds`` and
    does not stop the session; ``ChannelWrongStateError`` is re-raised.

    Returns:
        A dict with ``changed_files``, ``total_files``, ``changed``,
        ``new_files``, ``faileds`` and ``faileds_count``.

    Raises:
        Exception: when the download directory cannot be created, or when
            fewer than 5 links were found outside single-faculty debug mode.
    """
    faileds = []

    # Start from an empty download directory. A failed removal (for example
    # because the directory does not exist yet) is only reported.
    try:
        try:
            shutil.rmtree(DIRNAME)
            print(f"Directory '{DIRNAME}' and its contents deleted successfully.")
        except Exception as e:
            print(f"Error deleting directory '{DIRNAME}': {e}")
        os.mkdir(DIRNAME)
        print(f"Directory '{DIRNAME}' created successfully.")
    except Exception:
        print(f"Failed create '{DIRNAME}': ")
        raise

    print("main(); parse links starting...")
    excel_links = links_parser.parse_links(
        FACULTETS if DEBUG_ONE_FAC is None else [DEBUG_ONE_FAC],
        DEBUG_NO_LINKS_DELAY=DEBUG_NO_LINKS_DELAY
    )
    if len(excel_links) < 5 and not DEBUG_ONE_FAC:
        raise Exception("Safety exception: excel links count < 5; maybe in vstu.ru tech works")

    last_changeds = set()
    states = []
    changed = False
    changed_files = 0
    total_files = len(excel_links)
    new_files = 0

    for excel_dict in excel_links:
        try:
            last_changeds.add(excel_dict['last_changed'])
            excel_url = excel_dict['url']

            # Skip a link whose URL was already handled in this session.
            # (The former `continue` sat inside the inner search loop, so
            # the duplicate was announced as skipped but processed anyway.)
            duplicate_of = next((s for s in states if s['excel']['url'] == excel_url), None)
            if duplicate_of is not None:
                print(f"Doubled excel files(By URLs)! Current 1th={excel_dict}; 2th={duplicate_of['excel']}")
                print("Skipped!")
                continue

            facultet = excel_dict['facultet']
            excel_filename = excel_url.split("/")[-1]
            json_path = parsed_file_path(excel_filename)
            if json_path is None:
                # Recorded in `faileds` by the handler below.
                raise ValueError(f"Unknown filename format: {excel_filename}")
            excel_dict['json_represent'] = json_path.split(os.path.sep)[-1]
            print(f"Processing {facultet} {excel_filename}")

            state = load_parsed_state(excel_filename)
            is_new = state is None
            if is_new:
                state = {}
                _publish_json('parser.excel_found.new', {
                    "type": "excel_file_found",
                    "same": False,
                    "is_new": True,
                    "excel_dict": excel_dict
                })
                print(f"RabbitMQ published 'parser.excel_found.new'")
                new_files += 1
            else:
                # A state that cannot be compared is treated as different.
                same_date = False
                try:
                    same_date = state['excel']['last_changed'] == excel_dict['last_changed']
                    print(f"Excel[{excel_filename}]: inServer={excel_dict['last_changed']}, inState={state['excel']['last_changed']} same={same_date}")
                except Exception:
                    print(f"Excel[{excel_filename}]: failed testify last_changed")

                r = "parser.excel_found." + ("same" if same_date else "different") + "." + facultet
                _publish_json(r, {
                    "type": "excel_file_found",
                    "same": same_date,
                    "is_new": False,
                    "excel_dict": excel_dict
                })
                print(f"RabbitMQ published r={r}")

                if same_date:
                    # Unchanged file: refresh the timestamp, clear the
                    # per-session marker and keep the stored parse result.
                    state['actual_at'] = currt()
                    state['excel'].pop('different_in_this_session', None)
                    states.append(state)
                    save_parsed_state(excel_filename, state)
                    continue

            # New or changed file: download it and parse it again.
            changed_files += 1
            changed = True
            excel_dict['different_in_this_session'] = True
            excel_dict['recognized_facultet'] = utils.get_preferer_facultet(facultets_data, excel_url=excel_dict['url'])
            state['actual_at'] = currt()
            state['excel'] = excel_dict

            is_xlsx = excel_url.endswith(".xlsx")
            download_place = f"{DIRNAME}/" + excel_filename + "_" + facultet + ".xls" + ("x" if is_xlsx else "")
            utils.download_file_from_url(excel_url, download_place)
            state['excel']['sha1hash'] = utils.calculate_sha1(download_place)
            state['sheets'] = parse_sheets(download_place)

            _publish_json("parser.excel_parsed." + facultet, {
                "type": "excel_file_parsed_not_same",
                "is_new": is_new,
                "state": state
            })
            save_parsed_state(excel_filename, state)
            states.append(state)
        except ChannelWrongStateError:
            # A broken channel is not a per-file problem; let the caller see it.
            raise
        except Exception as e:
            faileds.append({
                "uuid": str(uuid.uuid4()),
                "exception": str(e),
                "traceback": traceback.format_exception(e),
                "context": f"Failed process excel file {excel_dict.get('url')}"
            })
            traceback.print_exception(e)

    # Oldest and newest `last_changed` value seen in this session. The
    # placeholder stays when there were no links (IndexError) or the values
    # cannot be ordered (TypeError).
    lc = {"*_x": ":("}
    try:
        ordered = sorted(last_changeds)
        lc = {
            "early": ordered[0],
            "newly": ordered[-1]
        }
    except (IndexError, TypeError):
        pass

    with open("parser.json", 'w', encoding="utf-8") as fp:
        json.dump({
            "last_changeds": lc,
            "actual_at": currt(),
            "all_files": excel_links,
            "faileds": faileds
        }, fp=fp, ensure_ascii=False)

    with open("groups.json", 'w', encoding="utf-8") as fp:
        json.dump(gen_groups_from_states(states), fp=fp, ensure_ascii=False)

    if changed:
        result = {
            "version": 2,
            "notice": "ОТКАЗ ОТ ОТВЕТСТВЕННОСТИ: ПРЕДОСТАВЛЯЕТСЯ КАК-ЕСТЬ (AS-IS) БЕЗ КАКИХ ЛИБО ГАРАНТИЙ",
            "contact": "https://fazziclay.com/ или fazziclay@gmail.com",
            "api_notices": {
                "just_save_and_check_diffs": "просто сохраните и проверяйте разницу"
            },
            "actual_at": currt(),
            "all_files": sorted(states, key=lambda item: item['excel']['url']),
            "faileds": faileds
        }
        with open("result_v2.json", 'w', encoding="utf-8") as fp:
            json.dump(result, fp=fp, ensure_ascii=False)
        _publish_json("parser.result_v2", {
            "type": "schedule_result_v2_changed",
        })

    # Remove the download directory together with its contents.
    try:
        shutil.rmtree(DIRNAME)
        print(f"Directory '{DIRNAME}' and its contents deleted successfully.")
    except Exception as e:
        print(f"Error deleting directory '{DIRNAME}': {e}")

    return {
        "changed_files": changed_files,
        'total_files': total_files,
        "changed": changed,
        "new_files": new_files,
        "faileds": faileds,
        "faileds_count": len(faileds)
    }
def check_dirs():
    """Create the PARSED_DIR directory when nothing exists at that path."""
    if os.path.exists(PARSED_DIR):
        return
    os.mkdir(PARSED_DIR)
def main():
    """Load the faculty data and run parsing sessions.

    ``facultets.json`` is read into the module-level ``facultets_data``.
    One session is then run, or sessions are repeated forever when
    ``INFINITY_LOOP`` is enabled, with a random pause of 14 to 21 minutes
    between them. After every session a ``parser.session_end.complete``
    or ``parser.session_end.failed`` message is published with the
    duration, the session summary and the error text, if any.
    """
    global facultets_data
    # Read explicitly as UTF-8, like every other file in this module, so
    # the result does not depend on the locale's default encoding.
    with open("facultets.json", encoding="utf-8") as fp:
        facultets_data = json.load(fp=fp)

    flag = True
    while flag:
        if not INFINITY_LOOP:
            # Single-run mode: finish after this iteration.
            flag = False
        t = utils.StepTimeCounter()
        err = None
        sess = None
        try:
            check_dirs()
            print("BEGIN run_session();")
            sess = run_session()
            print("END run_session();")
            if DEBUG_ONE_FAC:
                # Leaves the loop before the session_end message is published.
                print("DEBUG_ONE_FAC; break infinity-loop")
                break
        except Exception as e:
            err = e
            print("Exception in run_session();")
            traceback.print_exception(e)

        channel.basic_publish(
            exchange=EXCHANGE_NAME,
            routing_key="parser.session_end." + ('complete' if err is None else 'failed'),
            properties=pika.BasicProperties(
                content_type="application/json",
                delivery_mode=2
            ),
            body=json.dumps({
                "type": "session_end",
                "err": str(err) if err else None,
                "duration": t.step(),
                "session": sess
            }, ensure_ascii=False).encode('utf-8')
        )

        if flag:
            # Random pause so that sessions do not start at a fixed interval.
            sleep_time = random.randint(14*60, 21*60)
            print(f"Sleep for {round(sleep_time/6)/10} minutes")
            time.sleep(sleep_time)
            print("Wake up!")
# Script entry point: run main() only when the file is executed directly.
if "__main__" == __name__:
    print("Start")
    main()
    print("Bye!")