From 7e0e4a0b714059eff0d80bd6ce9cbf7de9c741f4 Mon Sep 17 00:00:00 2001 From: FazziCLAY Date: Mon, 16 Mar 2026 20:53:42 +0300 Subject: [PATCH] refactor: big, more patterns\n\nBREAKING CHANGES --- Dockerfile | 0 main.py | 131 ++++++++------ parser.py | 453 +++++++++++++++++++++++++++++------------------- translations.py | 63 ++++++- utils.py | 13 +- 5 files changed, 416 insertions(+), 244 deletions(-) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..e69de29 diff --git a/main.py b/main.py index 2b33f81..fbd80ed 100644 --- a/main.py +++ b/main.py @@ -5,6 +5,7 @@ import json import os +import random import time import traceback import uuid @@ -26,50 +27,75 @@ FACULTETS = sorted([ DIRNAME = "excels" DIFFABLE_DATES = "diffable_dates.txt" +SKIP_DIFFABLE_DATES = True + DEBUG_ONE_FAC = None #'fevt' -result_groups = {} +LOGGING = False + +unique_raws = set() result = { "version": 1, "notice": "ОТКАЗ ОТ ОТВЕТСТВЕННОСТИ: Данные, доступ к API и т.д. предоставляется КАК-ЕСТЬ (AS-IS) без каких либо, явно или не явно подразумеваемых гарантий.\n\nПарсер написал: Миронов Станислав\n\nИсточник данных: https://www.vstu.ru/student/raspisaniya/zanyatiy/index.php", "actual_at": round(time.time()), - "documentation": "https://fazziclay.com/api/v1/vstu_schedule_parser/scheme.json", - "daypicture": "QwQ", - "daycite": "running on a rope", + "documentation": "https://fazziclay.com/api/v1/vstu_schedule_parser/scheme.json (temporary outdated)", + "daypicture": "0w0", + "daycite": "KIlLSWITCH", "contact": "https://fazziclay.com/", "university": "VSTU", "university_site": "https://www.vstu.ru/", "source": "https://fazziclay.com/api/v1/vstu_schedule_parser/result.json", "stat": { "total_parsing_time": -1, + "excels": { + "fine": 0, + "bad": 0 + }, + "groups": 0, + "unique_raws": -1 }, "api_notices": { - "updated_at": 1759651871, - "text": "Пожалуйста сохраняйте 'updated_at', это время изменения ЭТОГО текста. Тут возможно будут появлятся важные BREAKING CHANGES и дедлайны к ним.\nПо хорошему если updated_at другой по сравнению с вашем кэшем это сообщение должно отправляться вам в телеграм как уведомление о поедстоящих изменениях\nwarning=True значит 'text' содержит важное а не как щас hint.\n\n ~fazziclay aka Stanislav;\n\n2025-10-05: добавлено data_source_hash в эксель и в группу. Это SHA1 of скачанный эксель файл.", - "warning": False, + "updated_at": 1773523692, + "text_pre1": "Пожалуйста сохраняйте 'updated_at', это время изменения ЭТОГО текста. Тут возможно будут появлятся важные BREAKING CHANGES и дедлайны к ним.\nПо хорошему если updated_at другой по сравнению с вашем кэшем это сообщение должно отправляться вам в телеграм как уведомление о поедстоящих изменениях\nwarning=True значит 'text' содержит важное а не как щас hint.\n\n ~fazziclay aka Stanislav;\n\n2025-10-05: добавлено data_source_hash в эксель и в группу. Это SHA1 of скачанный эксель файл.", + "text": "2026-03-15 BREAKING CHANGES! By Stanislav Mironov.\n\nИзменено многое в угоду унифкации и расширению спаршенных групп. Пока alpha", + "warning": True, "tut-plavayuschaya-struktura": "required only 'updated_at', 'text' and 'warning'" }, - "doubled_groups": [], "debug": { - "bleu~~": 2 + "bleu~~": 3 }, "excels": [], "facultets": FACULTETS, - - "emptykey1": "", - "emptykey2": "", - - "groups": result_groups, - - "emptykey3": "", - "emptykey4": "", + "group_names_parsed": [], + "unique_raws": unique_raws, "see_header_at_top_of_this_file": "SEE TOP OF THIS FILE | ОБРАТИТЕ ВНИМАНИЕ НА ВЕРХ ЭТОГО ФАЙЛА" } +def process_obj(data): + try: + if isinstance(data, dict): + for key, value in data.items(): + if key == "raw": + unique_raws.update(value) + + process_obj(value) + + # Если это список, проходим по его элементам + elif isinstance(data, list): + for item in data: + process_obj(item) + + except Exception as e: + print("Failed process_obj") + print(e) + def process_excel_file(facultet, excel_url, counter, latest_changed): is_xlsx = excel_url.endswith(".xlsx") download_place = f"{DIRNAME}/" + f"_[C{counter}]_" + facultet + ".xls" + ("x" if is_xlsx else "") excel_filename = excel_url.split("/")[-1] + if "ФЭУ" not in excel_filename: + print("SKIPPED") + return excel_info = { "filename": excel_filename, @@ -80,9 +106,9 @@ def process_excel_file(facultet, excel_url, counter, latest_changed): "group_names_parsed": [], "facultet": facultet, "counter": counter, - "week_keys_metadata": {} + "sheets": [] } - parser.LOGGING = False + parser.LOGGING = LOGGING try: aigenerated.download_file_from_url(excel_url, download_place) @@ -94,44 +120,45 @@ def process_excel_file(facultet, excel_url, counter, latest_changed): while True: print(f"Parsing sheet №{reader.get_sheet_index()+1} (from 1)") + sheet_dict = { + "index": reader.get_sheet_index(), + "name": reader.get_sheet_name(), + "reader_info": reader.info(), + "group_names_parsed": [], + "groups": {} + } + excel_info['sheets'].append(sheet_dict) prs = parser.Parser(reader) print("Parser created; parser.parse();") prs.parse() print("parsed done!") + + if len(prs.raw_no_schedule) > 0: + sheet_dict["raw_no_schedule"] = prs.raw_no_schedule + + if len(prs.features) > 0: + sheet_dict["features"] = sorted(prs.features) + if prs.parser_error is not None: - excel_info["parser_error_cycle_" + str(reader.get_sheet_index()+1)] = prs.parser_error + sheet_dict["parser_error"] = prs.parser_error if prs.parser_warnings is not None and len(prs.parser_warnings) > 0: - excel_info["parser_warnings_cycle_" + str(reader.get_sheet_index()+1)] = prs.parser_warnings + sheet_dict["parser_warnings"] = prs.parser_warnings for group_name in prs.groups.keys(): - if group_name in result_groups.keys(): - print(f" -- WTF -- Doubled groups -- name: {group_name}") - if 'warning_doubled_groups_skip' not in excel_info.keys(): - excel_info['warning_doubled_groups_skip'] = [] - - excel_info['warning_doubled_groups_skip'].append(group_name) - result['doubled_groups'].append(group_name) - - - continue - - gr = result_groups[group_name] = prs.groups[group_name] - gr['facultet'] = facultet - gr['data_source'] = excel_filename # same as 'filename' in excel_info's - gr['data_source_hash'] = sha1hash - gr['debug'] = { - "excel_url": excel_url, - "reader_info": reader.info(), - "reader_sheet_index": reader.get_sheet_index(), - "download_place": download_place - } + gr = prs.groups[group_name] + gr["excel_url"] = excel_url + sheet_dict["group_names_parsed"].append(group_name) excel_info["group_names_parsed"].append(group_name) - excel_info['week_keys_metadata'] = prs.week_keys_metadata + result["group_names_parsed"].append(group_name) + result['stat']['groups'] += 1 + sheet_dict['week_keys_metadata'] = prs.week_keys_metadata + sheet_dict['groups'][group_name] = gr + process_obj(gr['slots']) - print(f"Populates {len(prs.groups)} groups to result: " + " ".join(prs.groups.keys())) + print(f"Populates {len(prs.groups)} groups: " + " ".join(prs.groups.keys())) if not reader.has_next_sheet(): print("File ended") @@ -159,10 +186,12 @@ def process_excel_file(facultet, excel_url, counter, latest_changed): }) result['excels'].append(excel_info) + k = "fine" if len(excel_info['group_names_parsed']) > 0 else "bad" + result['stat']['excels'][k] += 1 faileds = [] def main(): - global result_groups, result + global result t = utils.StepTimeCounter() try: try: @@ -189,7 +218,9 @@ def main(): if now_diffable_dates == prev_diffable_dates: print("No date changes in vstu.ru website. Stopping") - return + if not SKIP_DIFFABLE_DATES: + return + print("SKIP_DIFFABLE_DATES is True, force resuming") counter = 10000 for excel_link in EXCEL_LINKS: @@ -200,14 +231,8 @@ def main(): process_excel_file(facultet, excel_url, counter, latest_changed) print("Saving result.json") - group_names_alphabeticaly = sorted(result_groups.keys()) - sorted_groups = {} - for group_name in group_names_alphabeticaly: - sorted_groups[group_name] = result_groups[group_name] - - result['groups'] = sorted_groups - result['stat']['total_parsing_time'] = t.step() + result['unique_raws'] = sorted(unique_raws) json.dump(result, open('result.json', 'w'), indent=2, ensure_ascii=False) print("Saved to result.json indent=2") diff --git a/parser.py b/parser.py index fc73df1..649fdbe 100644 --- a/parser.py +++ b/parser.py @@ -3,10 +3,21 @@ PAIR_NUMS = [ "1-2", "3-4", "5-6", "7-8", "9-10", "11-12", "13-14", "15-16" ] +WEEKDAYS_STARTSWITH = [ + "понед", + "вторник", + "среда", + "четверг", + "пятница", + "суббота" +] +bad_group_names = [ + "янв", "февр", "март", "апр", "май", "сент", "окт", "ноя", "дек", "июнь", "июль", "авг" +] + +from datetime import time import json -import uuid -import aigenerated from coord import Coord, Merged from translations import ExcelSheetReader import utils @@ -17,50 +28,127 @@ LOGGING = False def pprint(*args, **kwargs): if LOGGING: print(*args, **kwargs) + +def is_weeknum(text): + for wd in WEEKDAYS_STARTSWITH: + if text.strip().replace(" ", "").lower().startswith(wd): + return True + return False + +def is_pair(text): + for p in PAIR_NUMS: + if text.strip().replace(" ", "").lower().startswith(p): + return True + return False class Parser: def __init__(self, reader: ExcelSheetReader): self.reader = reader - self.groups = {} - self.week_keys_metadata = {} + self.groups = {} # Группы которые удалось распарсить + self.features = set() # фичи данной страницы + self.week_keys_metadata = {} # календарик + self.schedule_range_row = None # [min, max] диапазон col включительно где расписание + self.raw_no_schedule = [] # всё что не schedule_range_row отправляется сюда ('СОГЛАСОВАНО:', etc..) - self.weeknums: defaultdict = defaultdict(set) # no support json! - self.parser_error = None - self.parser_warnings = [] + self.weeknums: defaultdict = defaultdict(set) # no support json! (для week_keys_metadata) + self.parser_error = None # ошибка парсера перед выходом + self.parser_warnings = [] # предупреждения парсера pprint("Parser created for '{0}'".format(reader.info())) def parse(self): - monday = self.reader.find("ПОНЕДЕЛЬНИК") - if monday is None: + # Характерные признаки разных сеток + no_pair_numeration = False + col_distance_pair_weekday = None + weekday_firstly_calendar = False + + first_weekday = self.reader.find_any(WEEKDAYS_STARTSWITH, startswith=True, nospace=True) + + if first_weekday is None: + self.features.add("no_weekdays") print(" -- Failed parse! -- ") - print("ПОНЕДЕЛЬНИК НЕ НАЙДЕН!") - self.parser_error = "'ПОНЕДЕЛЬНИК' не найден в таблице." + print("дни недели не найдены!") + self.parser_error = f"{WEEKDAYS_STARTSWITH} ни один найден в таблице. Дня недели нет." return - if monday.col != 4: - print("--- warning parse! ---") - print(f"Monday col != 4 (actual: {monday})") - self.parser_warnings.append(f"Monday col != 4 (actual: {monday}); Это, наверное, может работать не стабильно!") + pair_num_any = self.reader.find_any(PAIR_NUMS, nospace=True) + if pair_num_any is None: + no_pair_numeration = True + self.features.add("no_pair_numeration") + self.parser_warnings.append(f"Нет нумерации академических часов {PAIR_NUMS}") + + else: + self.features.add("pair_numeration") + col_distance_pair_weekday = pair_num_any.col - first_weekday.col - head_rx = monday.row - 1 # выше первого понидельника + head_rx = first_weekday.row - 1 # выше первого понидельника + group_col_start = first_weekday.col + 2 + if col_distance_pair_weekday is not None: + if col_distance_pair_weekday > 1: + weekday_firstly_calendar = True + self.features.add("weekdays_before_calendar") + group_col_start = pair_num_any.col + 1 + if head_rx < 0: - raise Exception("head_rx < 0: Программа пыталась найти 'ПОНЕДЕЛЬНИК', но по всей видимости не нашла.") + raise Exception("head_rx < 0: Программа пыталась найти день недели, но по всей видимости не нашла.") head = self.reader.get_row_values(head_rx) # get all ROW (months, groups) pprint(f"head={head}") - self.groups = parse_groups(self.reader, head, monday, head_rx) # parse groups to self.groups + + head_joined = " ||| ".join([v for v in head if isinstance(v, str) and v.strip()]) + print(head_joined) + if "1 неделя" in head_joined or "1 НЕДЕЛЯ" in head_joined or "2 неделя" in head_joined or "2 НЕДЕЛЯ" in head_joined or "ИЗМЕНЕНИЯ" in head_joined or "изменения" in head_joined or "vtf-vstu.ru" in head_joined: + head_rx -= 1 + head = self.reader.get_row_values(head_rx) # get all ROW (months, groups) + pprint(f"head (upper)={head}") + self.features.add("post_groups_info_row") + + self.groups = parse_groups(self.reader, head, group_col_start, head_rx) # parse groups to self.groups pprint(f'self.groups={json.dumps(self.groups, indent=2, ensure_ascii=False)}') - + pprint("\n\n\n") for group in self.groups.values(): pprint("\nSTART OF PROCESS GROUP\n") - self.process_group(group, monday) + self.process_group(group, first_weekday, pair_num_any.col if pair_num_any else None) pprint("\nEND OF PROCESS GROUP\n") - - self.process_weekmetadatas(monday) - def process_weekmetadatas(self, first_monday: "Coord"): + # week metadatas parse + S = 9999999 + group_min_col = S + group_min_row = S + + for x in self.groups.values(): + p = x['position'] + group_min_row = min(p[0], group_min_row) + group_min_col = min(p[1], group_min_col) + + if group_min_row != S and group_min_col != S: + pprint("Process weekmetadatas!") + self.process_weekmetadatas(Coord(row=group_min_row, col=group_min_col)) + + # parse no-schedule raws (согласовано, и т.д.) + self.parse_raw_no_schedule() + + + def parse_raw_no_schedule(self): + """Распарсить всё за пределами self.schedule_range_row в self.raw_no_schedule""" + if self.schedule_range_row is None: + return + + row = 0 + while row < self.reader.get_row_count(): + if row >= self.schedule_range_row[0] and row <= self.schedule_range_row[1]: + row = self.schedule_range_row[1] + 1 + + row_values = self.reader.get_row_values(row) + row_values = [v for v in row_values if isinstance(v, str) and v.strip()] + if len(row_values) > 0: + self.raw_no_schedule.append(row_values) + + row += 1 + + def process_weekmetadatas(self, first_group: "Coord"): + """Обработать календарик""" for x in self.weeknums.keys(): pprint(x) set_of_merged: set = self.weeknums[x] @@ -82,14 +170,16 @@ class Parser: self.parser_warnings.append(f"Processing weekmetadata for '{x}' failed because weekday excel block width != 1 (actual {weekday_merged.width()})") continue - month_row = first_monday.row - 1 - curr_col = weekday_merged.low.col - 1 + month_row = first_group.row + curr_col = first_group.col - 1 while curr_col >= 0: month_pos = Coord(month_row, curr_col) month_cell = month_pos.cell(self.reader) if month_cell.is_empty(): pprint("month cell is empty") - break + curr_col -= 1 + continue + month_name = str(month_cell.value).strip() pprint(month_cell) all_nums_of_month = utils.parse_all_dirt(self.reader, month_pos.replace(row=weekday_merged.low.row), right=1, down=weekday_merged.height()) @@ -117,6 +207,16 @@ class Parser: def push_weekday_meta(self, weekday: str, weeknum: int, week_key_name: str, merged: "Merged"): self.weeknums[week_key_name].add(merged) + + def row_with_schedule_notify(self, row_coord): + if self.schedule_range_row is None: + self.schedule_range_row = [row_coord, row_coord] + + if self.schedule_range_row[1] < row_coord: + self.schedule_range_row[1] = row_coord + + if self.schedule_range_row[0] > row_coord: + self.schedule_range_row[0] = row_coord def parse_potokoviy(self, merged: Merged): speaker = None @@ -132,163 +232,157 @@ class Parser: return {"loc": str(location).strip(), "leader": str(speaker).strip(), "name": str(merged.cell(self.reader).value).strip()} - def process_group(self, group: dict, monday: Coord): + def process_group(self, group: dict, first_weekday: Coord, pair_pos_col): """ Обработать группы, выполняется для каждой группы, после того как они распарены (parse_groups) group = {'name': 'ИВТ-260', 'position': [5, 6], 'position_human': 'G6:J6'} """ pprint(f"process_group group={group}") group_name = group['name'] - pprint(group_name) - row = group['position'][0] + 1 # counter for while, +1 for shift down; также номер строки в таблице (вроде с нуля) + pprint(F"Имя группы: {group_name}") + row_c1 = group['position'][0] + 1 # counter for while, +1 for shift down; также номер строки в таблице (вроде с нуля) + self.row_with_schedule_notify(group['position'][0]) + group_header_pos = Coord(group['position'][0], group['position'][1]) + width = group['width'] weeknum = 1 # номер недели, щёлкнет +1 при каком-то условии. previous_pair = None - while row < self.reader.get_row_count(): # maybe условие чтобы не уйти ниже чем есть строк - pos = Coord(row, group['position'][1]) # текущая позиция, верхний левый угол (=low) - pprint(f"while pos={pos}") - pos_right = pos.shift(right=3) - pair_pos = pos.replace(col=monday.col + 1) - weekday_pos = pos.replace(col=monday.col) - merged = self.reader.get_merged_coord(pos) - merged_cell = merged.cell(self.reader) - cv = merged_cell.value - # В конце (12 пара:>) название группы, можно использовать как якорь - if utils.unspace(cv) == group_name: - pprint("Lesson == group name; ending group loop.") - break + + weekcycles = 0 + while row_c1 < self.reader.get_row_count(): + pos_c1 = Coord(row_c1, group['position'][1]) # текущая позиция, верхний левый угол (=low) + self.row_with_schedule_notify(pos_c1.row) - weekday_mr = self.reader.get_merged_coord(weekday_pos) - weekday = utils.unspace(weekday_mr.cell(self.reader).value) - pair_mr = self.reader.get_merged_coord(pair_pos) - pair = utils.unspace(pair_mr.cell(self.reader).value) - - skip = 0 - if weekday == "": - if weeknum == 1: - weeknum += 1 - pprint("------") - skip = 1 - row += 1 - else: - break - if not skip: - next = 3 # на сколько пыгнуть для следующего шага? + if pos_c1.cell(self.reader).is_nospace_nocase_same(group_name): + pprint("Ended with grpup name; stop moving down, break") + break + + weekday_pos = pos_c1.replace(col=first_weekday.col) + weekday_cell = weekday_pos.cell(self.reader) + weekday_mr = self.reader.get_merged_coord(weekday_pos) + weekday = weekday_cell.value + + if not is_weeknum(weekday): + row_c1 += 1 + pprint("Not weeknum!") + if weekcycles > 0: + if (weeknum != 2): + pprint("Weeknum now 2") + weekday = 0 + weeknum = 2 + continue + + pprint(weekday) + weekday_key_name = weekday + ("_1" if weeknum == 1 else "_2") + self.push_weekday_meta(weekday, weeknum, weekday_key_name, weekday_mr) + + # state + event_no = 1 + is_widely = False + override_col_range = None + all_raw = set() + pairs = set() + times = [] + first_coord = None + + row_c2 = row_c1 + while row_c2 <= weekday_mr.high.row: + pos_c2 = Coord(row_c2, group['position'][1]) # текущая позиция (внутри группы, внутри дня недели), верхний левый угол (=low) + cell_c2 = pos_c2.cell(self.reader) + mr_c2 = self.reader.get_merged_coord(pos_c2) - weekday_key_name = weekday + ("_1" if weeknum == 1 else "_2") - self.push_weekday_meta(weekday, weeknum, weekday_key_name, weekday_mr) + if first_coord is None: + first_coord = pos_c2.row - is_empty_lesson = len(utils.parse_all_dirt(self.reader, pos, 4, 3)) == 0 # если в поле не найдено ничего.. - parsed_discipline_name = None - parsed_location = None - parsed_leader = None - pairs = 1 - wtf_tomanypairs = False - is_solid = pos_right in merged - parsed_uncotigorized = [] - is_wide_maybe_potokoviy = merged.width() > 4 # потоковая ли лекция (занимает несколько групп.) - - if not is_empty_lesson: - cur = pos.shift(down=2) - while utils.has_no_bottom_border(self.reader, cur): - next += 3 - pairs += 1 - pprint(f"next = {next} cur={cur}") - if pairs >= 7: - wtf_tomanypairs = True - break - cur = cur.shift(down=3) - - if is_wide_maybe_potokoviy: - ret = self.parse_potokoviy(merged) - parsed_location = ret['loc'] - parsed_leader = ret['leader'] - parsed_discipline_name = ret['name'] - parsed_uncotigorized = list(utils.parse_all_dirt(self.reader, merged.low, merged.width(), next)) - - else: - if (is_solid): - parsed_discipline_name = cv - - parsed_uncotigorized = list(utils.parse_all_dirt(self.reader, merged.low, 4, next)) - - # попытка исправить пару (1-2) если пустая. - fuck_empty_pair_in_excel = pair == "" - previous_dump = previous_pair - if fuck_empty_pair_in_excel: - if previous_pair is None or previous_pair == "": - pair = f"EMPTY_IN_EXCEL" - else: - pair = utils.next_element(PAIR_NUMS, previous_pair) + pair_num = None + pair_num_mr = None + if pair_pos_col is not None: + pair_num = pos_c2.replace(col=pair_pos_col) + pair_num_mr = self.reader.get_merged_coord(pair_num) - if pair != "": - previous_pair = pair if next == 3 else None # костыль чтобы избежать гипотетически не верной даты. + if (not is_widely) and (mr_c2.low.col < group_header_pos.col or mr_c2.high.col > group_header_pos.col + width - 1): + is_widely = True + override_col_range = (mr_c2.low.col, mr_c2.high.col) + + col_low = group_header_pos.col + col_high = group_header_pos.col + width - 1 + if override_col_range is not None: + col_low = min(col_low, override_col_range[0]) + col_high = max(col_high, override_col_range[1]) - # пытаемся из некотегорезированных данных выцепить место и лидера (препода) - prepods = set() - if parsed_leader is not None: prepods.add(parsed_leader.strip()) - - locations = set() - if parsed_location is not None: locations.add(parsed_location.strip().replace(" ", "")) - - for x in list(parsed_uncotigorized): - if aigenerated.is_surname_string(x): - prepods.add(x.strip()) - - if aigenerated.is_room_number(x): - locations.add(x.strip().replace(" ", "") if x is not None else None) - - # попытка починить пустую дисциплину - if parsed_discipline_name is None: - l = sorted(utils.remove_from_list(list(parsed_uncotigorized), list(locations | prepods | set([parsed_location, parsed_leader])))) - parsed_discipline_name = " ".join(l) - - # чистим сеты от мусора - utils.discards_list(prepods, nones=True, emptystrings=True) - utils.discards_list(locations, nones=True, emptystrings=True) - utils.discards_list(parsed_uncotigorized, nones=True, emptystrings=True) - - # если не пустой предмет то записываем его - if not is_empty_lesson: - slots = group['slots'] - w = weekday_key_name - if w not in slots.keys(): - slots[w] = {} + dirty_line = utils.parse_all_dirt(self.reader, Coord(row_c2, col_low), (col_high - col_low + 1), 1, with_cells=True) + if len(dirty_line) > 0: + if pair_num_mr is not None: + pair_num_to_add = pair_num_mr.cell(self.reader).value.replace(" ", "").strip() + if len(pair_num_to_add) == 0: + pair_num_to_add = "???" + pairs.add(pair_num_to_add) - today = slots[w] - today[pair] = { - "excel_pos": str(pos), - "discipline_name": parsed_discipline_name.strip(), - "locations": sorted(locations), - "leads": sorted(prepods), - "is_solid": is_solid, - "time_coeff": pairs, - "is_flow": is_wide_maybe_potokoviy, - "lefttopmerged": { - "width": merged.width(), - "height": merged.height(), - "excel_range": utils.merged_humanize(merged.as_numbers()) - }, - "raw": sorted(parsed_uncotigorized), - "weekday": utils.weekday_to_num(weekday), - "weeknum": weeknum - } - if fuck_empty_pair_in_excel: - today[pair]['pair_num_empty'] = { - "prev": previous_dump, - "restored": pair != "", - "pair": pair - } - if wtf_tomanypairs: - today[pair]['to_many_parsing_time_coeff'] = True - + for cell in dirty_line: + if not cell.is_time: + all_raw.add(str(cell.value)) + else: + dt: time = cell.value + times.append(str(dt)) - # INCREMENT на next и конец цикла. - row += next + def clean_state(): + nonlocal is_widely, override_col_range, event_no, all_raw, pairs, times, first_coord + is_widely = False + override_col_range = None + event_no += 1 + all_raw = set() + pairs = set() + first_coord = None + times = [] + + + if not utils.has_no_bottom_border(self.reader, pos_c2) and not(mr_c2.high.row - row_c2 > 0): + if not (len(all_raw) == 0): + # this code last for current state event + pprint(f"№{event_no} {pairs}: {'[wide] ' if is_widely else ''} raw={all_raw}") + + slots = group['slots'] + w = weekday_key_name + if w not in slots.keys(): + slots[w] = {} + + pair_name = "????" + try: + pair_name = sorted(pairs)[0] + except: pass + + today = slots[w] + obj = { + "object": "event", + "pairs": sorted(pairs), + "is_flow": is_widely, + "excel_range": utils.merged_humanize((first_coord, col_low, row_c2, col_high)), + "raw": sorted(all_raw), + "weekday": utils.weekday_to_num(weekday), + "weeknum": weeknum + } + if len(times) > 0: + obj['times'] = times + + if pair_pos_col is None: + slots[w] = obj + else: + today[pair_name] = obj + # here may be a empty all_raw + clean_state() + first_coord = None + + + if row_c2 >= weekday_mr.high.row: + clean_state() + pprint("Last for weekday") + row_c2 += 1 + + row_c1 += weekday_mr.height() + weekcycles += 1 - -def parse_groups(reader: "ExcelSheetReader", head, monday: Coord, head_rx): +def parse_groups(reader: "ExcelSheetReader", head, col_start, head_rx): """Распознать список групп и метаданные к ним, по сути получить список названий группы и координат её верхнего header-а (AQ6:AT6)""" groups = {} i = 0 @@ -296,21 +390,26 @@ def parse_groups(reader: "ExcelSheetReader", head, monday: Coord, head_rx): x = head[i] pprint(f"while i={i} head[i]={x}") merged = reader.get_merged_coord(Coord(head_rx, i)) - if i > monday.col + 1: - if merged is None or x == "": - break - - if merged.width() != 4: - pprint(f"WARNING: group header witdh !=4 (found: {merged.width()}); blocks !=4 not supported by parser.") + if i >= col_start: + if merged is None or x == "" or x is None: break name = utils.unspace(x) - groups[name] = { - "name": name, - "position": [head_rx, i], - "position_human": utils.merged_humanize(merged.as_numbers()), - "slots": {} - } + skip = False + if "-" not in name: + for x in bad_group_names: + if x in name.lower(): + skip = True + pprint(f"Skip groupname {name} because not dash in name and in blacklist") + + if not skip: + groups[name] = { + "name": name, + "position": [head_rx, i], + "width": merged.width(), + "position_human": utils.merged_humanize(merged.as_numbers()), + "slots": {} + } if merged is None: i += 1 diff --git a/translations.py b/translations.py index c61b79d..4211e85 100644 --- a/translations.py +++ b/translations.py @@ -1,6 +1,7 @@ # --- Абстрактный базовый класс (Контракт) --- from abc import ABC, abstractmethod +from datetime import datetime, time import openpyxl import xlrd @@ -10,9 +11,18 @@ from coord import Coord, Merged EMPTY_CTYPES = [xlrd.XL_CELL_EMPTY, xlrd.XL_CELL_BLANK] class TranschendentnostCell: - def __init__(self, value, is_empty): + def __init__(self, value, is_empty, is_time=False): self.value = value + self.is_time = isinstance(value, time) or is_time self._is_empty = is_empty + + def is_nospace_nocase_same(self, query): + try: + if self.value.lower().replace(" ", "").strip() == query.lower().replace(" ", "").strip(): + return True + except: pass + + return False def is_empty(self): return self._is_empty @@ -28,6 +38,10 @@ class ExcelSheetReader(ABC): @abstractmethod def get_sheet_index(self): pass + + @abstractmethod + def get_sheet_name(self): + pass @abstractmethod def has_next_sheet(self): @@ -71,16 +85,28 @@ class ExcelSheetReader(ABC): return "TODO: info" @abstractmethod - def cell(self, row, col): + def cell(self, row, col) -> TranschendentnostCell: """Возвращает абстрактную клетку""" pass - def find(self, query = None): + def find(self, query = None, startswith=False, nospace=False): + return self.find_any([query], startswith=startswith, nospace=nospace) + + def find_any(self, query = None, startswith=False, nospace=False): for rx in range(self.get_row_count()): i = 0 for x in self.get_row_values(rx): - if x == query: - return Coord(rx, i) + if nospace: + x = str(x).replace(" ", "").strip() + + for query_selected in query: + if x == query_selected: + return Coord(rx, i) + elif startswith: + try: + if str(x).lower().startswith(query_selected.lower()): + return Coord(rx, i) + except: pass i += 1 return None @@ -117,6 +143,9 @@ class XlrdSheetReader(ExcelSheetReader): def init_sheet(self): self.sheet = self.book.sheet_by_index(self.sheet_index) + + def get_sheet_name(self): + return self.sheet.name def has_next_sheet(self): return self.sheet_index < len(self.book.sheet_names())-1 @@ -140,7 +169,24 @@ class XlrdSheetReader(ExcelSheetReader): def cell(self, row, col): """Возвращает абстрактную клетку""" c = self.sheet.cell(row, col) - return TranschendentnostCell(c.value, c.ctype in EMPTY_CTYPES) + is_empty = c.ctype in EMPTY_CTYPES + is_time = c.ctype == xlrd.XL_CELL_DATE + value = c.value + if is_empty: + value = "" + elif is_time: + if isinstance(value, float): + if value <= 1: + seconds = round(value * 86400) + minutes, seconds = divmod(seconds, 60) + hours, minutes = divmod(minutes, 60) + value = time(hour=hours, second=seconds, minute=minutes) + else: + print(f"TODO: value is {value} its unix? not 0.xxxxxxxx") + else: + is_time = False + print("IsTime but not float!") + return TranschendentnostCell(value, is_empty, is_time=is_time) def get_border_style(self, coord: Coord, side): row = coord.row @@ -192,6 +238,9 @@ class OpenpyxlSheetReader(ExcelSheetReader): def get_sheet_index(self): return self.sheet_index + def get_sheet_name(self): + return self.workbook.sheetnames[self.sheet_index] + def has_next_sheet(self): return self.sheet_index < len(self.workbook.sheetnames)-1 @@ -221,7 +270,7 @@ class OpenpyxlSheetReader(ExcelSheetReader): c = self._get_cell(row, col) is_empty = (c.value is None) - return TranschendentnostCell("" if is_empty else c.value, is_empty) + return TranschendentnostCell("" if is_empty else c.value, is_empty, is_time=isinstance(c.value, time)) def get_cell_value(self, row, col): cell = self._get_cell(row, col) diff --git a/utils.py b/utils.py index b91968f..20c43ed 100644 --- a/utils.py +++ b/utils.py @@ -53,7 +53,7 @@ def remove_from_list(l: list, todel: list): return l -def parse_all_dirt(reader: "ExcelSheetReader", min_pos: Coord, right, down): +def parse_all_dirt(reader: "ExcelSheetReader", min_pos: Coord, right, down, with_cells=False): RET = set() row = min_pos.row @@ -61,10 +61,9 @@ def parse_all_dirt(reader: "ExcelSheetReader", min_pos: Coord, right, down): col = min_pos.col while col < min_pos.col + right: #print(excel_coordinate(row, col)) - cv = reader.get_cell_value(row, col) - value = str(cv).strip() - if cv is not None and len(value) > 0: - RET.add(value) + cv = reader.cell(row, col) + if cv is not None and not cv.is_empty(): + RET.add(cv if with_cells else str(cv.value)) col += 1 row += 1 @@ -165,7 +164,7 @@ def find(sh, query = None): return None def weekday_to_num(st: str): - if st.upper().strip() == "ПОНЕДЕЛЬНИК": + if st.upper().strip().startswith("ПОНЕД"): return 1 if st.upper().strip() == "ВТОРНИК": return 2 @@ -177,7 +176,7 @@ def weekday_to_num(st: str): return 5 if st.upper().strip() == "СУББОТА": return 6 - if st.upper().strip() == "ВОСКРЕСЕНЬЕ": + if st.upper().strip().startswith("ВОСКР"): return 7 return -1