All checks were successful
Build and Run VSTU Schedule Parser / build_and_run (push) Successful in 16s
471 lines
16 KiB
Python
471 lines
16 KiB
Python
# Copyright Stanislav Mironov
|
||
|
||
# Project-wide rule: coordinates are always given as ROW first, then COL; numbering starts at zero
|
||
|
||
|
||
import json
|
||
import pika
|
||
import os
|
||
import random
|
||
import time
|
||
import traceback
|
||
import uuid
|
||
|
||
from pika.exceptions import ChannelWrongStateError
|
||
import parser
|
||
import translations
|
||
import utils
|
||
import json
|
||
import links_parser
|
||
import shutil
|
||
from dotenv import load_dotenv
|
||
# Let python-dotenv populate the environment before any setting is read.
load_dotenv()

# RabbitMQ connection URL; None when the variable is not set.
RABBITMQ_URL = os.getenv("RABBITMQ_URL")
# Exchange that receives every event published by this module.
EXCHANGE_NAME = os.getenv("RABBITMQ_EXCHANGE", "vstu_schedule")
# Boolean switches are enabled by "yes" or "true" (case-insensitive).
INFINITY_LOOP = os.getenv("INFINITY_LOOP", "no").lower() in ("yes", "true")
# The same flag value is mirrored onto the parser module.
parser.LOGGING = LOGGING = os.getenv("PARSER_LOGGING", "no").lower() in ("yes", "true")
|
||
|
||
|
||
# Open the RabbitMQ connection at import time and declare the durable topic
# exchange used for publishing. A failure is only reported here, so
# `connection` / `channel` may be left undefined for the rest of the module.
try:
    connection = pika.BlockingConnection(
        pika.URLParameters(RABBITMQ_URL)
    )
    channel = connection.channel()

    channel.exchange_declare(
        exchange=EXCHANGE_NAME,
        exchange_type='topic',
        durable=True,
    )
except Exception as exc:
    print("Failed to connect RabbitMQ")
    traceback.print_exception(exc)
|
||
|
||
def currt():
    """Return the current Unix timestamp rounded to a whole number of seconds."""
    now = time.time()
    return round(now)
|
||
|
||
# Faculty codes handed to links_parser.parse_links(); kept in sorted order.
FACULTETS = sorted((
    "asp",
    "mag",
    "fastiv",
    "fat",
    "ftkm",
    "ftpp",
    "feu",
    "fevt",
    "htf",
    "vkf",
    "mmf",
    "fpik",
))
# Scratch directory for downloaded Excel files; recreated by run_session().
DIRNAME = "excels"
# Directory holding one JSON state file per parsed Excel file.
PARSED_DIR = "parsed"

# When set to a faculty code (e.g. 'fevt'), only that faculty is crawled and
# main() stops after a single session.
DEBUG_ONE_FAC = None
# Debug switch consulted by save_parsed_state().
DEBUG_NO_SAVE_STATES = False
# Forwarded as-is to links_parser.parse_links().
DEBUG_NO_LINKS_DELAY = True

# Contents of facultets.json; filled in by main().
facultets_data = None
|
||
|
||
|
||
def gen_groups_from_states(states):
    """Build the groups index from the per-file parsed states.

    Every group found in every sheet is stored under a normalised key of the
    form ``<faculty abbrev>/<GROUP KEY>``. When the faculty recognised from
    the Excel URL differs from the technical one, its abbreviation is
    prepended as well. A key seen more than once is flagged ``doubled`` and
    collects every Excel descriptor it came from.

    Args:
        states: iterable of state dicts with ``'sheets'`` and ``'excel'`` keys.

    Returns:
        ``{"version": 1, "groups": {...}}``, or ``None`` when the module-level
        ``facultets_data`` has not been loaded.
    """
    if facultets_data is None:
        print("FAILED BECAUSE facultets_data is NONE!!!")
        return

    index = {}
    for state in states:
        for sheet in state['sheets'].values():
            # Entries without a 'groups' mapping simply contribute nothing.
            sheet_groups = sheet.get('groups', {})

            for raw_key, group in sheet_groups.items():
                real_name = group['name']

                upper_key = raw_key.upper()
                recognized = utils.get_preferer_facultet(
                    facultets_data,
                    state['excel']['url'],
                    skip_for=['mag', 'asp'],
                )
                technical = state['excel']['facultet']

                path = utils.get_abbrev_for_facultet(facultets_data, technical) + "/" + upper_key
                if recognized is not None and technical != recognized:
                    path = utils.get_abbrev_for_facultet(facultets_data, recognized) + "/" + path

                # Normalise: drop spaces and newlines, force upper case.
                path = path.replace(" ", "")
                path = path.replace("\n", "")
                path = path.upper().strip()

                known = index.get(path)
                if known is not None:
                    known['doubled'] = True
                    known['excels'].append(state['excel'])
                    continue

                index[path] = {
                    "full_path_key": path,
                    "real_name": real_name,
                    "facultet_tech": technical,
                    "facultet_regognized": recognized,
                    "excels": [state['excel']],
                    "excel_position": group['position_human'],
                    "excel_sheet": {
                        "name": sheet['name'],
                        "index": sheet['index'],
                    },
                    "slots_weekdays_used": sorted(group['slots'].keys()),
                }

    # The JSON round-trip yields a detached copy with keys in sorted order.
    serialized = json.dumps(index, sort_keys=True, ensure_ascii=False)
    return {"version": 1, "groups": json.loads(serialized)}
|
||
|
||
def parse_sheets(download_place):
    """Parse every sheet of a downloaded Excel file.

    Args:
        download_place: path of the downloaded file, handed to
            ``translations.create_reader``.

    Returns:
        dict mapping ``"SHEET_<index>"`` to that sheet's result. If anything
        raises, the sheets collected so far are kept and an ``"error"`` entry
        (message, log anchor UUID, timestamp) is added instead of propagating.
    """
    sheets = {}
    try:
        reader = translations.create_reader(download_place)
        print("Reader info")
        print(reader.info())

        has_more = True
        while has_more:
            timer = utils.StepTimeCounter()
            print(f"Parsing sheet №{reader.get_sheet_index()+1} (from 1)")
            entry = {
                "index": reader.get_sheet_index(),
                "name": reader.get_sheet_name(),
                "reader_info": reader.info(),
                "groups": {}
            }
            # Registered before parsing so a failing sheet still shows up.
            sheets[f"SHEET_{reader.get_sheet_index()}"] = entry
            sheet_parser = parser.Parser(reader)

            print("Parser created; parser.parse();")
            sheet_parser.parse()

            print("parsed done!")
            entry['parse_time'] = round(timer.step())

            # Optional sections are emitted only when they carry data.
            if len(sheet_parser.raw_no_schedule) > 0:
                entry["other_raws"] = sheet_parser.raw_no_schedule

            if len(sheet_parser.features) > 0:
                entry["features"] = sorted(sheet_parser.features)

            if sheet_parser.parser_error is not None:
                entry["parser_error"] = sheet_parser.parser_error

            warnings_found = sheet_parser.parser_warnings
            if warnings_found is not None and len(warnings_found) > 0:
                entry["parser_warnings"] = sheet_parser.parser_warnings

            # week_keys_metadata is attached only when the sheet has groups.
            for name_key, group in sheet_parser.groups.items():
                entry['week_keys_metadata'] = sheet_parser.week_keys_metadata
                entry['groups'][name_key] = group

            print(f"Populates {len(sheet_parser.groups)} groups: " + " ".join(sheet_parser.groups.keys()))

            if reader.has_next_sheet():
                reader.next_sheet()
                print("Next sheet!")
            else:
                print("File ended")
                has_more = False

    except Exception as e:
        print(e)
        traceback.print_exc()
        anchor = uuid.uuid4()
        sheets['error'] = {
            "smile": ":(",
            "error_message": str(e),
            "log_anchor": str(anchor),
            "time": currt()
        }
        # The anchor lets the stored error be matched to this log output.
        print(f"Log Anchor: {anchor}")

    return sheets
|
||
|
||
def parsed_file_path(excel_filename: str) -> str | None:
    """Map an Excel (or JSON) file name to its state-file path in PARSED_DIR.

    The final extension is replaced by ``.json`` and the whole name is
    lower-cased. Only the trailing extension is rewritten: the previous
    ``str.replace`` rewrote every occurrence, so e.g. ``a.xls.xls`` and
    ``a.json.xls`` collided on ``a.json.json``.

    Args:
        excel_filename: bare file name ending in ``json``, ``xls`` or
            ``xlsx`` (any letter case).

    Returns:
        ``PARSED_DIR/<lower-cased name>.json``, or ``None`` (after printing a
        message) when the extension is not supported.
    """
    stem, dot, extension = excel_filename.rpartition(".")
    kind = extension.lower()

    if kind not in ("json", "xls", "xlsx"):
        print(f"Unknown filename format: {excel_filename}")
        return None

    # `dot` is empty for a name without any "."; such a name is left as is.
    if dot and kind != "json":
        excel_filename = stem + ".json"

    return os.path.join(PARSED_DIR, excel_filename.lower())
|
||
|
||
def load_parsed_state(excel_filename):
    """Load the previously saved state for *excel_filename*.

    Args:
        excel_filename: bare Excel file name, resolved via parsed_file_path().

    Returns:
        The decoded JSON state, or ``None`` when the name has an unsupported
        extension or no state file exists yet.

    Raises:
        json.JSONDecodeError: when the state file is not valid JSON.
    """
    filepath = parsed_file_path(excel_filename)
    # parsed_file_path() returns None for unsupported extensions, and
    # os.path.exists(None) would raise TypeError instead of reporting "absent".
    if filepath is None or not os.path.exists(filepath):
        return None

    with open(filepath, "r", encoding="utf-8") as fp:
        return json.load(fp)
|
||
|
||
def save_parsed_state(excel_filename, obj):
    """Persist *obj* as the JSON state file for *excel_filename*.

    Args:
        excel_filename: bare Excel file name, resolved via parsed_file_path().
        obj: JSON-serialisable state to store.

    Raises:
        ValueError: when *excel_filename* has an unsupported extension.
        OSError: when the file cannot be written.
    """
    filepath = parsed_file_path(excel_filename)
    if DEBUG_NO_SAVE_STATES:
        print("Saved! (fake because DEBUG_NO_SAVE_STATES)")
        # The flag promises a fake save, so nothing may be written.
        return

    if filepath is None:
        raise ValueError(f"Cannot save parsed state: unsupported filename {excel_filename!r}")

    # Write to a sibling temp file and swap it in, so a failure in the middle
    # of json.dump() cannot leave a truncated state file behind.
    tmp_path = filepath + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as fp:
        json.dump(obj, fp, ensure_ascii=False, sort_keys=True)
    os.replace(tmp_path, filepath)

    print(f"Saved parsed state to '{filepath}'")
|
||
|
||
def run_session():
    """Run one crawl session: discover Excel links, re-parse changed files, write reports.

    For every discovered link the stored state is compared by ``last_changed``.
    Unchanged files are only re-stamped; new or changed files are downloaded,
    parsed and saved. Each step is announced on RabbitMQ. Afterwards
    ``parser.json`` and ``groups.json`` are rewritten, and ``result_v2.json``
    is rewritten (and announced) only if at least one file changed.

    Returns:
        dict with ``changed_files``, ``total_files``, ``changed``,
        ``new_files``, ``faileds`` and ``faileds_count``.

    Raises:
        Exception: when the download directory cannot be created, or when
            fewer than 5 links were found while DEBUG_ONE_FAC is unset.
        ChannelWrongStateError: re-raised from publishing; it aborts the
            session instead of being recorded as a per-file failure.
    """
    faileds = []

    # Start every session with an empty download directory.
    try:
        try:
            shutil.rmtree(DIRNAME)
            print(f"Directory '{DIRNAME}' and its contents deleted successfully.")
        except Exception as e:
            print(f"Error deleting directory '{DIRNAME}': {e}")
        os.mkdir(DIRNAME)
        print(f"Directory '{DIRNAME}' created successfully.")
    except Exception:
        print(f"Failed create '{DIRNAME}': ")
        raise

    print("main(); parse links starting...")
    excel_links = links_parser.parse_links(
        FACULTETS if DEBUG_ONE_FAC is None else [DEBUG_ONE_FAC],
        DEBUG_NO_LINKS_DELAY=DEBUG_NO_LINKS_DELAY,
    )

    if len(excel_links) < 5 and not DEBUG_ONE_FAC:
        raise Exception("Safety exception: excel links count < 5; maybe in vstu.ru tech works")

    last_changeds = set()
    states = []
    changed = False
    changed_files = 0
    total_files = len(excel_links)
    new_files = 0
    for excel_dict in excel_links:
        try:
            last_changeds.add(excel_dict['last_changed'])

            excel_url = excel_dict['url']

            # Skip a URL that was already handled in this session. The old
            # `continue` sat inside the inner search loop, so duplicates were
            # processed again and their groups ended up flagged "doubled".
            duplicate = next((s for s in states if s['excel']['url'] == excel_url), None)
            if duplicate is not None:
                print(f"Doubled excel files(By URLs)! Current 1th={excel_dict}; 2th={duplicate['excel']}")
                print("Skipped!")
                continue

            facultet = excel_dict['facultet']
            excel_filename = excel_url.split("/")[-1]
            json_path = parsed_file_path(excel_filename)
            if json_path is None:
                # Report the real cause instead of an AttributeError on None.
                raise ValueError(f"Unsupported excel filename: {excel_filename}")
            excel_dict['json_represent'] = os.path.basename(json_path)
            print(f"Processing {facultet} {excel_filename}")

            state = load_parsed_state(excel_filename)
            is_new = state is None
            if is_new:
                state = {}
                _publish_json('parser.excel_found.new', {
                    "type": "excel_file_found",
                    "same": False,
                    "is_new": True,
                    "excel_dict": excel_dict
                })
                print("RabbitMQ published 'parser.excel_found.new'")
                new_files += 1

            else:
                same_date = False
                try:
                    same_date = state['excel']['last_changed'] == excel_dict['last_changed']
                    print(f"Excel[{excel_filename}]: inServer={excel_dict['last_changed']}, inState={state['excel']['last_changed']} same={same_date}")
                except (LookupError, TypeError):
                    # Malformed stored state: treat the file as changed.
                    print(f"Excel[{excel_filename}]: failed testify last_changed")

                r = "parser.excel_found." + ("same" if same_date else "different") + "." + facultet
                _publish_json(r, {
                    "type": "excel_file_found",
                    "same": same_date,
                    "is_new": False,
                    "excel_dict": excel_dict
                })
                print(f"RabbitMQ published r={r}")

                if same_date:
                    # Unchanged file: refresh the timestamp and drop the
                    # "changed in this session" marker of an earlier run.
                    state['actual_at'] = currt()
                    state['excel'].pop('different_in_this_session', None)
                    states.append(state)
                    save_parsed_state(excel_filename, state)
                    continue

            changed_files += 1
            changed = True
            excel_dict['different_in_this_session'] = True
            excel_dict['recognized_facultet'] = utils.get_preferer_facultet(facultets_data, excel_url=excel_dict['url'])
            state['actual_at'] = currt()
            state['excel'] = excel_dict

            suffix = ".xlsx" if excel_url.endswith(".xlsx") else ".xls"
            download_place = f"{DIRNAME}/{excel_filename}_{facultet}{suffix}"
            utils.download_file_from_url(excel_url, download_place)
            state['excel']['sha1hash'] = utils.calculate_sha1(download_place)

            state['sheets'] = parse_sheets(download_place)

            _publish_json("parser.excel_parsed." + facultet, {
                "type": "excel_file_parsed_not_same",
                "is_new": is_new,
                "state": state
            })

            save_parsed_state(excel_filename, state)
            states.append(state)

        except ChannelWrongStateError:
            # A broken channel cannot be handled per file; abort the session.
            raise
        except Exception as e:
            faileds.append({
                "uuid": str(uuid.uuid4()),
                "exception": str(e),
                "traceback": traceback.format_exception(e),
                # .get(): a missing 'url' may be the very reason we are here.
                "context": f"Failed process excel file {excel_dict.get('url')}"
            })
            traceback.print_exception(e)

    with open("parser.json", 'w', encoding="utf-8") as fp:
        json.dump({
            "last_changeds": _last_changed_range(last_changeds),
            "actual_at": currt(),
            "all_files": excel_links,
            "faileds": faileds
        }, fp=fp, ensure_ascii=False)

    with open("groups.json", 'w', encoding="utf-8") as fp:
        json.dump(gen_groups_from_states(states), fp=fp, ensure_ascii=False)

    if changed:
        result = {
            "version": 2,
            "notice": "ОТКАЗ ОТ ОТВЕТСТВЕННОСТИ: ПРЕДОСТАВЛЯЕТСЯ КАК-ЕСТЬ (AS-IS) БЕЗ КАКИХ ЛИБО ГАРАНТИЙ",
            "contact": "https://fazziclay.com/ или fazziclay@gmail.com",
            "api_notices": {
                "just_save_and_check_diffs": "просто сохраните и проверяйте разницу"
            },
            "actual_at": currt(),
            "all_files": sorted(states, key=lambda item: item['excel']['url']),
            "faileds": faileds
        }

        with open("result_v2.json", 'w', encoding="utf-8") as fp:
            json.dump(result, fp=fp, ensure_ascii=False)

        _publish_json("parser.result_v2", {
            "type": "schedule_result_v2_changed",
        })

    # Remove the download directory together with its contents.
    try:
        shutil.rmtree(DIRNAME)
        print(f"Directory '{DIRNAME}' and its contents deleted successfully.")
    except Exception as e:
        print(f"Error deleting directory '{DIRNAME}': {e}")

    return {
        "changed_files": changed_files,
        'total_files': total_files,
        "changed": changed,
        "new_files": new_files,
        "faileds": faileds,
        "faileds_count": len(faileds),
    }


def _publish_json(routing_key, payload):
    """Publish *payload* as a UTF-8 JSON message (delivery_mode=2) to EXCHANGE_NAME."""
    channel.basic_publish(
        exchange=EXCHANGE_NAME,
        routing_key=routing_key,
        properties=pika.BasicProperties(
            content_type="application/json",
            delivery_mode=2
        ),
        body=json.dumps(payload, ensure_ascii=False).encode('utf-8')
    )


def _last_changed_range(last_changeds):
    """Return the smallest/largest ``last_changed`` value, or a placeholder dict."""
    try:
        ordered = sorted(last_changeds)
        return {
            "early": ordered[0],
            "newly": ordered[-1]
        }
    except (IndexError, TypeError):
        # Nothing collected, or the collected values cannot be ordered.
        return {"*_x": ":("}
|
||
|
||
def check_dirs():
    """Make sure the directory for parsed state files (PARSED_DIR) exists.

    Raises:
        OSError: when the directory cannot be created, including the case
            where PARSED_DIR exists but is not a directory.
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(PARSED_DIR, exist_ok=True)
|
||
|
||
def main():
    """Load faculty metadata and run parsing sessions, once or forever.

    ``facultets.json`` is loaded into the module-level ``facultets_data``.
    Each iteration runs one session and publishes a ``parser.session_end.*``
    event with its outcome. With INFINITY_LOOP enabled it then waits a random
    14-21 minutes and repeats; otherwise it returns after the first session.
    With DEBUG_ONE_FAC set it returns right after a successful session,
    without publishing the event.
    """
    global facultets_data
    # Every other file in this module is read as UTF-8; do not depend on the
    # platform's default encoding here.
    with open("facultets.json", encoding="utf-8") as fp:
        facultets_data = json.load(fp)

    while True:
        t = utils.StepTimeCounter()
        err = None
        sess = None
        try:
            check_dirs()

            print("BEGIN run_session();")
            sess = run_session()
            print("END run_session();")

            if DEBUG_ONE_FAC:
                print("DEBUG_ONE_FAC; break infinity-loop")
                break

        except Exception as e:
            err = e
            print("Exception in run_session();")
            traceback.print_exception(e)

        channel.basic_publish(
            exchange=EXCHANGE_NAME,
            routing_key="parser.session_end." + ('complete' if err is None else 'failed'),
            properties=pika.BasicProperties(
                content_type="application/json",
                delivery_mode=2
            ),
            body=json.dumps({
                "type": "session_end",
                "err": str(err) if err is not None else None,
                "duration": t.step(),
                "session": sess
            }, ensure_ascii=False).encode('utf-8')
        )

        if not INFINITY_LOOP:
            break

        sleep_time = random.randint(14*60, 21*60)
        print(f"Sleep for {round(sleep_time/6)/10} minutes")
        # time.sleep() would keep the blocking pika connection from servicing
        # broker frames (heartbeats) for the whole pause; connection.sleep()
        # keeps processing them while waiting.
        deadline = time.monotonic() + sleep_time
        try:
            connection.sleep(sleep_time)
        except Exception as e:
            # Connection trouble must not shorten the pause between crawls;
            # the next publish will surface the problem.
            traceback.print_exception(e)
            remaining = deadline - time.monotonic()
            if remaining > 0:
                time.sleep(remaining)
        print("Wake up!")
|
||
|
||
|
||
|
||
# Script entry point: main() runs only when the file is executed directly.
if __name__ == "__main__":
    print("Start")
    main()
    print("Bye!")
|