diff --git a/aigenerated.py b/aigenerated.py index 97a7b55..468509c 100644 --- a/aigenerated.py +++ b/aigenerated.py @@ -1,3 +1,6 @@ +# Copyright GEMINI + + import re # --- Ресурсы для алгоритма --- diff --git a/coord.py b/coord.py index ab218a2..bb25e81 100644 --- a/coord.py +++ b/coord.py @@ -1,3 +1,4 @@ +# Copyright Stanislav Mironov class Coord: @@ -13,6 +14,10 @@ class Coord: return Coord(self.row if row is None else row, self.col if col is None else col) + def copy(self) -> "Coord": + return Coord(self.row, + self.col) + def cell(self, reader: "ExcelSheetReader") -> "TranschendentnostCell": return reader.cell(self.row, self.col) diff --git a/excels/mag1757588817[C2003].xlsx.zip b/excels/mag1757588817[C2003].xlsx.zip deleted file mode 100644 index 6f5c70c..0000000 Binary files a/excels/mag1757588817[C2003].xlsx.zip and /dev/null differ diff --git a/links_parser.py b/links_parser.py index 57050d6..3553032 100644 --- a/links_parser.py +++ b/links_parser.py @@ -1,5 +1,7 @@ +# Copyright Stanislav Mironov + + import re -import time from urllib.parse import urljoin import requests from requests.structures import CaseInsensitiveDict @@ -8,7 +10,7 @@ from bs4 import BeautifulSoup BASE_URL = "https://www.vstu.ru/" RASP_PREFIX = "https://www.vstu.ru/student/raspisaniya/zanyatiy/index.php?dep=" - +# Парсит ссылки на эксель .xls & .xlsx файлы и выдаёт их def parse_links(facultets): session = requests.Session() session.headers = CaseInsensitiveDict( @@ -18,17 +20,17 @@ def parse_links(facultets): "Accept-Language: ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3", "Accept-Encoding": "gzip, deflate", "Connection": "keep-alive", - "Referer": "http://dump.vstu.ru/", "Upgrade-Insecure-Requests": "1", "Priority": "u=0, i", "Pragma": "no-cache", - "Cache-Control": "no-cach", + "Cache-Control": "no-cach" } ) EXCEL_LINKS = {} for facultet in facultets: url = RASP_PREFIX + facultet + print("getting...") r = session.get(url) print(f"GET {url}") soup = BeautifulSoup(r.text, 'html.parser') @@ -51,3 +53,4 @@ def parse_links(facultets): print(f"+url {excel_url}") return EXCEL_LINKS + diff --git a/main.py b/main.py index c63590e..52fbb4d 100644 --- a/main.py +++ b/main.py @@ -1,62 +1,127 @@ +# Copyright Stanislav Mironov + +# Общее правило проекта, сначала в координатах идёт ROW а потом COL, нумерация с нуля + + import json -import re +import os import time import traceback -from urllib.parse import urljoin -import pandas as pd -import xlwt - -import xlrd -import requests - - -from bs4 import BeautifulSoup +import uuid import aigenerated import parser import translations import utils import json import links_parser -# Общее правило проекта, сначала в координатах идёт ROW а потом COL, нумерация с нуля +import shutil + +def currt(): + return round(time.time()) FACULTETS = [ "asp", "mag", "fastiv", "fat", "ftkm", "ftpp", "feu", "fevt", "htf", "vkf", "mmf", "fpik" ] +DIRNAME = "excels" -DEBUG_ONE_FAC = None #'fevt' +DEBUG_ONE_FAC = None #'htf' +result_groups = {} +result = { + "version": 1, + "notice": "ОТКАЗ ОТ ОТВЕТСТВЕННОСТИ: Данные, доступ к API и т.д. предоставляется КАК-ЕСТЬ (AS-IS) без каких либо, явно или не явно подразумеваемых гарантий.\n\nПарсер написал: Миронов Станислав", + "actual_at": round(time.time()), + "documentation": "TODO", + "daypicture": "QwQ", + "university": "VSTU", + "university_site": "https://www.vstu.ru/", + "stat": { + "total_parsing_time": -1, + }, + "api_notices": { + "updated_at": 1757688552, + "text": "Пожалуйста сохраняйте 'updated_at', это время изменения ЭТОГО текста. Тут возможно будут появлятся важные BREAKING CHANGES и дедлайны к ним.\nПо хорошему если updated_at другой по сравнению с вашем кэшем это сообщение должно отправляться вам в телеграм как уведомление о поедстоящих изменениях\nwarning=True значит 'text' содержит важное а не как щас hint.\n\n ~fazziclay aka Stanislav;", + "warning": False, + "tut-plavayuschaya-struktura": "required only 'updated_at', 'text' and 'warning'" + }, + "doubled_groups": [], + "debug": { + "bleu~~": 1 + }, + "excels": [], + "facultets": FACULTETS, + "emptykey1": "", + "emptykey2": "", + + "groups": result_groups, + + "emptykey3": "", + "emptykey4": "", + "see_header_at_top_of_this_file": "SEE TOP OF THIS FILE | ОБРАТИТЕ ВНИМАНИЕ НА ВЕРХ ЭТОГО ФАЙЛА" +} def process_excel_file(facultet, excel_url, counter, timeid): is_xlsx = excel_url.endswith(".xlsx") + filename = f"{DIRNAME}/" + timeid + f"_[C{counter}]_" + facultet + ".xls" + ("x" if is_xlsx else "") + + excel_info = { + "filename": excel_url.split("/")[-1], + "url": excel_url, + "download_place": filename, + "stat": { + "download": -1, + "create_reader": -1, + "parse": -1, + "cycles": 0 + }, + "group_names_parsed": [], + "facultet": facultet, + "counter": counter + } + parser.LOGGING = False + try: - filename = "excels/" + timeid + "_" + facultet + f"_[C{counter}]" + ".xls" + ("x" if is_xlsx else "") + t = utils.StepTimeCounter() aigenerated.download_file_from_url(excel_url, filename) + excel_info["stat"]['download'] = t.step() reader = translations.create_reader(filename) print("Reader info") print(reader.info()) + excel_info["stat"]['create_reader'] = t.step() while True: - print(f"Parsing sheet №{reader.get_sheet_index()+1}") - parser.LOGGING = False + excel_info['stat']['cycles'] += 1 + print(f"Parsing sheet №{reader.get_sheet_index()+1} (from 1)") prs = parser.Parser(reader) prs.parse() + if prs.parser_error is not None: + excel_info["parser_error_cycle_" + excel_info['stat']['cycles']] = prs.parser_error + for group_name in prs.groups.keys(): - if group_name in result.keys(): + if group_name in result_groups.keys(): print(f" -- WTF -- Doubled groups -- name: {group_name}") + if 'warning_doubled_groups_skip' not in excel_info.keys(): + excel_info['warning_doubled_groups_skip'] = [] + + excel_info['warning_doubled_groups_skip'].append(group_name) + result['doubled_groups'].append(group_name) + + continue - gr = result[group_name] = prs.groups[group_name] + gr = result_groups[group_name] = prs.groups[group_name] gr['facultet'] = facultet gr['data_source'] = excel_url.split("/")[-1] - gr['parser_debug'] = { - "C_COUNTER": counter, + gr['debug'] = { + "counter": counter, "timeid": timeid, "excel_url": excel_url, "reader_info": reader.info(), "reader_sheet_index": reader.get_sheet_index(), "filename": filename } + excel_info["group_names_parsed"].append(group_name) print(f"Populates {len(prs.groups)} groups to result: " + " ".join(prs.groups.keys())) @@ -67,22 +132,40 @@ def process_excel_file(facultet, excel_url, counter, timeid): reader.next_sheet() print("Next sheet!") + excel_info["stat"]['parse'] = t.step() + + except Exception as e: print(f"Error while {excel_url}") print(e) traceback.print_exc() + u = uuid.uuid4() + excel_info['error'] = { + "smile": ":(", + "error_message": str(e), + "log_anchor": str(u), + "time": currt() + } + print(f"Log Anchor: {u}") faileds.append({ "ex": e, "fac": facultet, "url": excel_url }) + + result['excels'].append(excel_info) - -result = {} faileds = [] def main(): - EXCEL_LINKS = links_parser.parse_links(FACULTETS if DEBUG_ONE_FAC is None else [DEBUG_ONE_FAC]) + t = utils.StepTimeCounter() + try: + os.mkdir(DIRNAME) + print(f"Directory '{DIRNAME}' created successfully.") + except Exception: + print(f"Directory '{DIRNAME}' already exists.") + print("main(); parse links starting...") + EXCEL_LINKS = links_parser.parse_links(FACULTETS if DEBUG_ONE_FAC is None else [DEBUG_ONE_FAC]) counter = 0 timeid = str(round(time.time())) for facultet in EXCEL_LINKS.keys(): @@ -99,13 +182,24 @@ def main(): print("Excel file processing done!") print("Saving result.json") + + result['stat']['total_parsing_time'] = t.step() + json.dump(result, open('result.json', 'w'), indent=2, ensure_ascii=False) print("Saved to result.json") print("Faileds:") print(faileds) + # Delete a non-empty directory and its contents + try: + shutil.rmtree(DIRNAME) + print(f"Directory '{DIRNAME}' and its contents deleted successfully.") + except Exception as e: + print(f"Error deleting directory '{DIRNAME}': {e}") + if __name__ == "__main__": + print("Start") main() print("Bye!") diff --git a/parser.py b/parser.py index 3e4acec..1b4ec59 100644 --- a/parser.py +++ b/parser.py @@ -1,7 +1,11 @@ +# Copyright Stanislav Mironov + +PAIR_NUMS = [ + "1-2", "3-4", "5-6", "7-8", "9-10", "11-12", "13-14", "15-16" +] + import json - -import xlrd - +import uuid import aigenerated from coord import Coord, Merged from translations import ExcelSheetReader @@ -13,13 +17,13 @@ def pprint(*args, **kwargs): if LOGGING: print(*args, **kwargs) - class Parser: def __init__(self, reader: ExcelSheetReader): self.reader = reader self.groups = {} self.teachers = set() self.places = set() + self.parser_error = None pprint("Parser created for '{0}'".format(reader.info())) def parse(self): @@ -27,6 +31,7 @@ class Parser: if monday is None: print(" -- Failed parse! -- ") print("ПОНЕДЕЛЬНИК НЕ НАЙДЕН!") + self.parser_error = "'ПОНЕДЕЛЬНИК' не найден в таблице." return head_rx = monday.row - 1 # выше первого понидельника @@ -59,7 +64,7 @@ class Parser: # location location = merged.high.shift(down=1).cell(self.reader).value - return {"loc": str(location), "leader": str(speaker), "name": str(merged.cell(self.reader).value)} + return {"loc": str(location).strip(), "leader": str(speaker).strip(), "name": str(merged.cell(self.reader).value).strip()} def process_group(self, group, monday): """ @@ -71,13 +76,13 @@ class Parser: pprint(group_name) row = group['position'][0] + 1 # counter for while, +1 for shift down; также номер строки в таблице (вроде с нуля) weeknum = 1 # номер недели, щёлкнет +1 при каком-то условии. + previous_pair = None while row < self.reader.get_row_count(): # maybe условие чтобы не уйти ниже чем есть строк pos = Coord(row, group['position'][1]) # текущая позиция, верхний левый угол (=low) pos_right = pos.shift(right=3) pair_pos = pos.replace(col=5) weekday_pos = pos.replace(col=4) merged = self.reader.get_merged_coord(pos) - right_cell = pos_right.cell(self.reader) merged_cell = merged.cell(self.reader) cv = merged_cell.value # В конце (12 пара:>) название группы, можно использовать как якорь @@ -89,6 +94,16 @@ class Parser: weekday = utils.unspace(weekday_mr.cell(self.reader).value) pair_mr = self.reader.get_merged_coord(pair_pos) pair = utils.unspace(pair_mr.cell(self.reader).value) + fuck_empty_pair_in_excel = pair == "" + previous_dump = previous_pair + if fuck_empty_pair_in_excel: + if previous_pair is None or previous_pair == "": + pair = f"EMPTY_IN_EXCEL_{uuid.uuid4()}" + else: + pair = utils.next_element(PAIR_NUMS, previous_pair) + + if pair != "": + previous_pair = pair skip = 0 if weekday == "": @@ -99,26 +114,25 @@ class Parser: row += 1 else: break + if not skip: next = 3 # на сколько пыгнуть для следующего шага? - is_empty_lesson = right_cell.is_empty() and merged_cell.is_empty() - dispname = "" + is_empty_lesson = len(utils.parse_all_dirt(self.reader, pos, 4, 3)) == 0 # если в поле не найдено ничего.. parsed_discipline_name = None parsed_location = None parsed_leader = None - is_2pair = False + pairs = 1 is_solid = pos_right in merged parsed_uncotigorized = [] is_wide_maybe_potokoviy = merged.width() > 4 # потоковая ли лекция (занимает несколько групп.) - if is_empty_lesson: - dispname = "" if not is_empty_lesson: - may_prepod = merged.low.shift(down=2) - if utils.has_no_bottom_border(self.reader, may_prepod): - next = 6 - is_2pair = True + cur = merged.low.shift(down=2) + while utils.has_no_bottom_border(self.reader, cur): + next += 3 + pairs += 1 + cur = cur.shift(down=3) if is_wide_maybe_potokoviy: ret = self.parse_potokoviy(merged) @@ -127,45 +141,37 @@ class Parser: parsed_discipline_name = ret['name'] parsed_uncotigorized = list(utils.parse_all_dirt(self.reader, merged.low, merged.width(), next)) - else: if (is_solid): parsed_discipline_name = cv - dispname = cv - dispname += (" SOLD" if is_solid else " SPLIT") - dispname += (" [ДВУПАРНЫЙ]" if is_2pair else "") - parsed_uncotigorized = list(utils.parse_all_dirt(self.reader, merged.low, 4, next)) - - if parsed_leader: dispname += f" [{parsed_leader}]" - if parsed_location: dispname += f" [{parsed_location}]" - dispname = dispname.replace("\n", "\\n") - pprint(f"[{group_name}] row={row}; {pos} {pos_right} {pair} {weekday}: {'[ПОТОКОВЫЙ] ' if is_wide_maybe_potokoviy else ''}{dispname} {parsed_uncotigorized}") - # пытаемся из некотегорезированных данных выцепить место и лидера (препода) prepods = set() - if parsed_leader is not None: prepods.add(aigenerated.extract_last_name(parsed_leader)) + if parsed_leader is not None: prepods.add(parsed_leader.strip()) locations = set() - if parsed_location is not None: locations.add(parsed_location.replace(" ", "").replace("-", "")) + if parsed_location is not None: locations.add(parsed_location.strip().replace(" ", "")) for x in list(parsed_uncotigorized): if aigenerated.is_surname_string(x): - prepods.add(aigenerated.extract_last_name(x)) + prepods.add(x.strip()) if aigenerated.is_room_number(x): - locations.add(x.replace(" ", "").replace("-", "") if x is not None else None) + locations.add(x.strip().replace(" ", "") if x is not None else None) - # оставшееся в дисциплину (костыль) + # попытка починить пустую дисциплину if parsed_discipline_name is None: - parsed_discipline_name = " ".join(parsed_uncotigorized) + l = utils.remove_from_list(list(parsed_uncotigorized), [parsed_leader, parsed_location]) + parsed_discipline_name = " ".join(l) - prepods.discard(None) - prepods.discard("") - locations.discard(None) - locations.discard("") + # чистим сеты от мусора + utils.discards_list(prepods, nones=True, emptystrings=True) + utils.discards_list(locations, nones=True, emptystrings=True) + utils.discards_list(parsed_uncotigorized, nones=True, emptystrings=True) + + # если не пустой предмет то записываем его if not is_empty_lesson: slots = group['slots'] w = weekday + ("_1" if weeknum == 1 else "_2") @@ -174,22 +180,30 @@ class Parser: today = slots[w] today[pair] = { - "pos": str(pos), - "discipline": parsed_discipline_name, + "excel_pos": str(pos), + "discipline_name": parsed_discipline_name.strip(), "locations": list(locations), "leads": list(prepods), "is_solid": is_solid, - "is_2pair": is_2pair, + "time_coeff": pairs, "is_flow": is_wide_maybe_potokoviy, + "lefttopmerged": { + "width": merged.width(), + "height": merged.height(), + "excel_range": utils.merged_humanize(merged.as_numbers()) + }, "raw": parsed_uncotigorized, - "weeday": utils.weekday_to_num(weekday), + "weekday": utils.weekday_to_num(weekday), "weeknum": weeknum } + if fuck_empty_pair_in_excel: + today[pair]['pair_num_empty'] = { + "prev": previous_dump, + "restoted": pair != "", + "pair": pair + } - self.teachers.add(aigenerated.extract_last_name(parsed_leader)) - - # INCREMENT на next и конец цикла. row += next diff --git a/translations.py b/translations.py index f92e760..c61b79d 100644 --- a/translations.py +++ b/translations.py @@ -267,12 +267,11 @@ def create_reader(file_path, **kwargs) -> ExcelSheetReader: Создает и возвращает подходящий экземпляр ридера в зависимости от расширения файла. """ if file_path.lower().endswith('.xlsx'): - print("Используется движок openpyxl для .xlsx") return OpenpyxlSheetReader(file_path, **kwargs) elif file_path.lower().endswith('.xls'): - print("Используется движок xlrd для .xls") return XlrdSheetReader(file_path, **kwargs) else: - raise ValueError("Неподдерживаемый формат файла. Используйте .xls или .xlsx") \ No newline at end of file + raise ValueError("Неподдерживаемый формат файла. Используйте .xls или .xlsx") + diff --git a/utils.py b/utils.py index 5b683bb..7fc2cb2 100644 --- a/utils.py +++ b/utils.py @@ -1,14 +1,58 @@ -# gemini generated +# Copyright Stanislav Mironov + +import time import xlrd from coord import Coord, Merged from translations import ExcelSheetReader +import re + + +class StepTimeCounter: + def __init__(self): + self.time: float = -1.0 + self.createtime = time.time() + self.setnow() + + def setnow(self): + self.time = time.time() + + def step(self, no_set_now=False): + left = time.time() - self.time + if not no_set_now: + self.setnow() + return left + + def from_create(self): + left = time.time() - self.createtime + return left EMPTY_CTYPES = [xlrd.XL_CELL_EMPTY, xlrd.XL_CELL_BLANK] +def discards_list(trg, nones=True, emptystrings=True): + if nones: remove_from_list(trg, [None]) + if emptystrings: remove_from_list(trg, [""]) + def has_no_bottom_border(reader: "ExcelSheetReader", coord): return reader.get_border_style(coord, 'bottom') == 0 and reader.get_border_style(coord.shift(down=1), 'top') == 0 +def find_element_index(my_list, element): + if element in my_list: + return my_list.index(element) + else: + return -1 + +def next_element(arr, el): + index = find_element_index(arr, el) + return arr[index + 1] + +def remove_from_list(l: list, todel: list): + for x in todel: + if x in l: + l.remove(x) + + return l + def parse_all_dirt(reader: "ExcelSheetReader", min_pos, right, down): RET = set() @@ -17,17 +61,16 @@ def parse_all_dirt(reader: "ExcelSheetReader", min_pos, right, down): col = min_pos.col while col < min_pos.col + right: #print(excel_coordinate(row, col)) - value = str(reader.get_cell_value(row, col)) - if value is not None and len(value) > 0: + cv = reader.get_cell_value(row, col) + value = str(cv).strip() + if cv is not None and len(value) > 0: RET.add(value) col += 1 row += 1 return RET -import re - -# GEMINI +# GEMINI GENERATED def normalize_name(raw_name): """ Приводит разнородные записи ФИО к единому структурированному виду.