From 777fae4276a203c701fb1bdd7ade986263155e49 Mon Sep 17 00:00:00 2001 From: FazziCLAY Date: Sun, 5 Oct 2025 14:19:59 +0300 Subject: [PATCH] Added left calendar dates parsing --- hashes.py | 34 ++++++++++++++++++++++++ main.py | 36 +++++++++++++++++--------- parser.py | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++----- utils.py | 2 +- 4 files changed, 130 insertions(+), 19 deletions(-) create mode 100644 hashes.py diff --git a/hashes.py b/hashes.py new file mode 100644 index 0000000..fe847fe --- /dev/null +++ b/hashes.py @@ -0,0 +1,34 @@ +import hashlib + +def calculate_sha1(filepath): + """ + Calculates the SHA1 hash of a given file. + + Args: + filepath (str): The path to the file. + + Returns: + str: The hexadecimal representation of the SHA1 hash, or None if the file is not found. + """ + sha1_hash = hashlib.sha1() + try: + with open(filepath, "rb") as f: + # Read the file in chunks to handle large files efficiently + for chunk in iter(lambda: f.read(4096), b""): + sha1_hash.update(chunk) + return sha1_hash.hexdigest() + except FileNotFoundError: + print(f"Error: File not found at {filepath}") + return None + except Exception as e: + print(f"An error occurred: {e}") + return None + + +if __name__ == "__main__": + # Example usage: + file_path = "xls.xls" # Replace with the actual path to your file + sha1_result = calculate_sha1(file_path) + + if sha1_result: + print(f"The SHA1 hash of '{file_path}' is: {sha1_result}") \ No newline at end of file diff --git a/main.py b/main.py index 4f6fa2c..2b33f81 100644 --- a/main.py +++ b/main.py @@ -15,6 +15,7 @@ import utils import json import links_parser import shutil +import hashes def currt(): return round(time.time()) @@ -25,7 +26,7 @@ FACULTETS = sorted([ DIRNAME = "excels" DIFFABLE_DATES = "diffable_dates.txt" -DEBUG_ONE_FAC = None #'htf' +DEBUG_ONE_FAC = None #'fevt' result_groups = {} result = { "version": 1, @@ -42,14 +43,14 @@ result = { "total_parsing_time": -1, }, "api_notices": { - "updated_at": 1757688552, - "text": "Пожалуйста сохраняйте 'updated_at', это время изменения ЭТОГО текста. Тут возможно будут появлятся важные BREAKING CHANGES и дедлайны к ним.\nПо хорошему если updated_at другой по сравнению с вашем кэшем это сообщение должно отправляться вам в телеграм как уведомление о поедстоящих изменениях\nwarning=True значит 'text' содержит важное а не как щас hint.\n\n ~fazziclay aka Stanislav;", + "updated_at": 1759651871, + "text": "Пожалуйста сохраняйте 'updated_at', это время изменения ЭТОГО текста. Тут возможно будут появлятся важные BREAKING CHANGES и дедлайны к ним.\nПо хорошему если updated_at другой по сравнению с вашем кэшем это сообщение должно отправляться вам в телеграм как уведомление о поедстоящих изменениях\nwarning=True значит 'text' содержит важное а не как щас hint.\n\n ~fazziclay aka Stanislav;\n\n2025-10-05: добавлено data_source_hash в эксель и в группу. Это SHA1 of скачанный эксель файл.", "warning": False, "tut-plavayuschaya-struktura": "required only 'updated_at', 'text' and 'warning'" }, "doubled_groups": [], "debug": { - "bleu~~": 1 + "bleu~~": 2 }, "excels": [], "facultets": FACULTETS, @@ -66,22 +67,28 @@ result = { def process_excel_file(facultet, excel_url, counter, latest_changed): is_xlsx = excel_url.endswith(".xlsx") - filename = f"{DIRNAME}/" + f"_[C{counter}]_" + facultet + ".xls" + ("x" if is_xlsx else "") + download_place = f"{DIRNAME}/" + f"_[C{counter}]_" + facultet + ".xls" + ("x" if is_xlsx else "") + + excel_filename = excel_url.split("/")[-1] excel_info = { - "filename": excel_url.split("/")[-1], + "filename": excel_filename, + "data_source_hash": None, "url": excel_url, "latest_changed": latest_changed, - "download_place": filename, + "download_place": download_place, "group_names_parsed": [], "facultet": facultet, - "counter": counter + "counter": counter, + "week_keys_metadata": {} } parser.LOGGING = False try: - aigenerated.download_file_from_url(excel_url, filename) - reader = translations.create_reader(filename) + aigenerated.download_file_from_url(excel_url, download_place) + sha1hash = hashes.calculate_sha1(download_place) + excel_info['data_source_hash'] = sha1hash + reader = translations.create_reader(download_place) print("Reader info") print(reader.info()) @@ -95,6 +102,9 @@ def process_excel_file(facultet, excel_url, counter, latest_changed): print("parsed done!") if prs.parser_error is not None: excel_info["parser_error_cycle_" + str(reader.get_sheet_index()+1)] = prs.parser_error + + if prs.parser_warnings is not None and len(prs.parser_warnings) > 0: + excel_info["parser_warnings_cycle_" + str(reader.get_sheet_index()+1)] = prs.parser_warnings for group_name in prs.groups.keys(): if group_name in result_groups.keys(): @@ -110,14 +120,16 @@ def process_excel_file(facultet, excel_url, counter, latest_changed): gr = result_groups[group_name] = prs.groups[group_name] gr['facultet'] = facultet - gr['data_source'] = excel_url.split("/")[-1] + gr['data_source'] = excel_filename # same as 'filename' in excel_info's + gr['data_source_hash'] = sha1hash gr['debug'] = { "excel_url": excel_url, "reader_info": reader.info(), "reader_sheet_index": reader.get_sheet_index(), - "filename": filename + "download_place": download_place } excel_info["group_names_parsed"].append(group_name) + excel_info['week_keys_metadata'] = prs.week_keys_metadata print(f"Populates {len(prs.groups)} groups to result: " + " ".join(prs.groups.keys())) diff --git a/parser.py b/parser.py index 060a3e1..9c4952a 100644 --- a/parser.py +++ b/parser.py @@ -10,8 +10,9 @@ import aigenerated from coord import Coord, Merged from translations import ExcelSheetReader import utils +from collections import defaultdict -LOGGING = True +LOGGING = False def pprint(*args, **kwargs): if LOGGING: @@ -21,9 +22,11 @@ class Parser: def __init__(self, reader: ExcelSheetReader): self.reader = reader self.groups = {} - self.teachers = set() - self.places = set() + self.week_keys_metadata = {} + + self.weeknums: defaultdict = defaultdict(set) # no support json! self.parser_error = None + self.parser_warnings = [] pprint("Parser created for '{0}'".format(reader.info())) def parse(self): @@ -34,6 +37,11 @@ class Parser: self.parser_error = "'ПОНЕДЕЛЬНИК' не найден в таблице." return + if monday.col != 4: + print("--- warning parse! ---") + print(f"Monday col != 4 (actual: {monday})") + self.parser_warnings.append(f"Monday col != 4 (actual: {monday}); Это, наверное, может работать не стабильно!") + head_rx = monday.row - 1 # выше первого понидельника if head_rx < 0: raise Exception("head_rx < 0: Программа пыталась найти 'ПОНЕДЕЛЬНИК', но по всей видимости не нашла.") @@ -49,8 +57,62 @@ class Parser: pprint("\nSTART OF PROCESS GROUP\n") self.process_group(group, monday) pprint("\nEND OF PROCESS GROUP\n") + + self.process_weekmetadatas(monday) - pprint(self.teachers) + def process_weekmetadatas(self, first_monday: "Coord"): + for x in self.weeknums.keys(): + pprint(x) + set_of_merged: set = self.weeknums[x] + l = len(set_of_merged) + if l != 1: + self.week_keys_metadata[x] = { + "error": True, + "error_text": f"Parse error: count of found '{x}' (need view like WEEKDAY_1; weekday - in r; 1 - weeknum[1, 2]) is {l}; required only one!" + } + self.parser_warnings.append(f"Processing weekmetadata for '{x}' failed because count of uniqie merged cells not one (actual: {l}). :<") + continue + + weekday_merged: Merged = set_of_merged.pop() + if weekday_merged.width() != 1: + self.week_keys_metadata[x] = { + "error": True, + "error_text": f"Weekday excel block width != 1 (actual {weekday_merged.width()})" + } + self.parser_warnings.append(f"Processing weekmetadata for '{x}' failed because weekday excel block width != 1 (actual {weekday_merged.width()})") + continue + + month_row = first_monday.row - 1 + curr_col = weekday_merged.low.col - 1 + while curr_col >= 0: + month_pos = Coord(month_row, curr_col) + month_cell = month_pos.cell(self.reader) + if month_cell.is_empty(): + pprint("month cell is empty") + break + month_name = str(month_cell.value).strip() + pprint(month_cell) + all_nums_of_month = utils.parse_all_dirt(self.reader, month_pos.shift(down=1), right=1, down=weekday_merged.height()) + + if (x not in self.week_keys_metadata.keys()): + self.week_keys_metadata[x] = {} + + if (month_name not in self.week_keys_metadata[x].keys()): + self.week_keys_metadata[x][month_name] = [] + + for x2 in all_nums_of_month: + m = self.week_keys_metadata[x][month_name] + if x2 not in m: + try: + m.append(str(x2).replace(".0", "")) + except: + m.append(x2) + + curr_col -= 1 + + + def push_weekday_meta(self, weekday: str, weeknum: int, week_key_name: str, merged: "Merged"): + self.weeknums[week_key_name].add(merged) def parse_potokoviy(self, merged: Merged): speaker = None @@ -66,7 +128,7 @@ class Parser: return {"loc": str(location).strip(), "leader": str(speaker).strip(), "name": str(merged.cell(self.reader).value).strip()} - def process_group(self, group, monday): + def process_group(self, group: dict, monday: Coord): """ Обработать группы, выполняется для каждой группы, после того как они распарены (parse_groups) group = {'name': 'ИВТ-260', 'position': [5, 6], 'position_human': 'G6:J6'} @@ -109,6 +171,9 @@ class Parser: if not skip: next = 3 # на сколько пыгнуть для следующего шага? + weekday_key_name = weekday + ("_1" if weeknum == 1 else "_2") + self.push_weekday_meta(weekday, weeknum, weekday_key_name, weekday_mr) + is_empty_lesson = len(utils.parse_all_dirt(self.reader, pos, 4, 3)) == 0 # если в поле не найдено ничего.. parsed_discipline_name = None parsed_location = None @@ -182,7 +247,7 @@ class Parser: # если не пустой предмет то записываем его if not is_empty_lesson: slots = group['slots'] - w = weekday + ("_1" if weeknum == 1 else "_2") + w = weekday_key_name if w not in slots.keys(): slots[w] = {} diff --git a/utils.py b/utils.py index 7fc2cb2..b91968f 100644 --- a/utils.py +++ b/utils.py @@ -53,7 +53,7 @@ def remove_from_list(l: list, todel: list): return l -def parse_all_dirt(reader: "ExcelSheetReader", min_pos, right, down): +def parse_all_dirt(reader: "ExcelSheetReader", min_pos: Coord, right, down): RET = set() row = min_pos.row