refactor: big, more patterns\n\nBREAKING CHANGES

This commit is contained in:
2026-03-16 20:53:42 +03:00
parent 2105e9bc36
commit 7e0e4a0b71
5 changed files with 416 additions and 244 deletions

0
Dockerfile Normal file
View File

131
main.py
View File

@@ -5,6 +5,7 @@
import json import json
import os import os
import random
import time import time
import traceback import traceback
import uuid import uuid
@@ -26,50 +27,75 @@ FACULTETS = sorted([
DIRNAME = "excels" DIRNAME = "excels"
DIFFABLE_DATES = "diffable_dates.txt" DIFFABLE_DATES = "diffable_dates.txt"
SKIP_DIFFABLE_DATES = True
DEBUG_ONE_FAC = None #'fevt' DEBUG_ONE_FAC = None #'fevt'
result_groups = {} LOGGING = False
unique_raws = set()
result = { result = {
"version": 1, "version": 1,
"notice": "ОТКАЗ ОТ ОТВЕТСТВЕННОСТИ: Данные, доступ к API и т.д. предоставляется КАК-ЕСТЬ (AS-IS) без каких либо, явно или не явно подразумеваемых гарантий.\n\nПарсер написал: Миронов Станислав\n\nИсточник данных: https://www.vstu.ru/student/raspisaniya/zanyatiy/index.php", "notice": "ОТКАЗ ОТ ОТВЕТСТВЕННОСТИ: Данные, доступ к API и т.д. предоставляется КАК-ЕСТЬ (AS-IS) без каких либо, явно или не явно подразумеваемых гарантий.\n\nПарсер написал: Миронов Станислав\n\nИсточник данных: https://www.vstu.ru/student/raspisaniya/zanyatiy/index.php",
"actual_at": round(time.time()), "actual_at": round(time.time()),
"documentation": "https://fazziclay.com/api/v1/vstu_schedule_parser/scheme.json", "documentation": "https://fazziclay.com/api/v1/vstu_schedule_parser/scheme.json (temporary outdated)",
"daypicture": "QwQ", "daypicture": "0w0",
"daycite": "running on a rope", "daycite": "KIlLSWITCH",
"contact": "https://fazziclay.com/", "contact": "https://fazziclay.com/",
"university": "VSTU", "university": "VSTU",
"university_site": "https://www.vstu.ru/", "university_site": "https://www.vstu.ru/",
"source": "https://fazziclay.com/api/v1/vstu_schedule_parser/result.json", "source": "https://fazziclay.com/api/v1/vstu_schedule_parser/result.json",
"stat": { "stat": {
"total_parsing_time": -1, "total_parsing_time": -1,
"excels": {
"fine": 0,
"bad": 0
},
"groups": 0,
"unique_raws": -1
}, },
"api_notices": { "api_notices": {
"updated_at": 1759651871, "updated_at": 1773523692,
"text": "Пожалуйста сохраняйте 'updated_at', это время изменения ЭТОГО текста. Тут возможно будут появлятся важные BREAKING CHANGES и дедлайны к ним.\nПо хорошему если updated_at другой по сравнению с вашем кэшем это сообщение должно отправляться вам в телеграм как уведомление о поедстоящих изменениях\nwarning=True значит 'text' содержит важное а не как щас hint.\n\n ~fazziclay aka Stanislav;\n\n2025-10-05: добавлено data_source_hash в эксель и в группу. Это SHA1 of скачанный эксель файл.", "text_pre1": "Пожалуйста сохраняйте 'updated_at', это время изменения ЭТОГО текста. Тут возможно будут появлятся важные BREAKING CHANGES и дедлайны к ним.\nПо хорошему если updated_at другой по сравнению с вашем кэшем это сообщение должно отправляться вам в телеграм как уведомление о поедстоящих изменениях\nwarning=True значит 'text' содержит важное а не как щас hint.\n\n ~fazziclay aka Stanislav;\n\n2025-10-05: добавлено data_source_hash в эксель и в группу. Это SHA1 of скачанный эксель файл.",
"warning": False, "text": "2026-03-15 BREAKING CHANGES! By Stanislav Mironov.\n\nИзменено многое в угоду унифкации и расширению спаршенных групп. Пока alpha",
"warning": True,
"tut-plavayuschaya-struktura": "required only 'updated_at', 'text' and 'warning'" "tut-plavayuschaya-struktura": "required only 'updated_at', 'text' and 'warning'"
}, },
"doubled_groups": [],
"debug": { "debug": {
"bleu~~": 2 "bleu~~": 3
}, },
"excels": [], "excels": [],
"facultets": FACULTETS, "facultets": FACULTETS,
"group_names_parsed": [],
"emptykey1": "", "unique_raws": unique_raws,
"emptykey2": "",
"groups": result_groups,
"emptykey3": "",
"emptykey4": "",
"see_header_at_top_of_this_file": "SEE TOP OF THIS FILE | ОБРАТИТЕ ВНИМАНИЕ НА ВЕРХ ЭТОГО ФАЙЛА" "see_header_at_top_of_this_file": "SEE TOP OF THIS FILE | ОБРАТИТЕ ВНИМАНИЕ НА ВЕРХ ЭТОГО ФАЙЛА"
} }
def process_obj(data, raws=None):
    """Recursively collect every value stored under a ``"raw"`` key.

    Walks nested dicts and lists. Whenever a dict key equals ``"raw"`` its
    value is merged into the target set, and the value is then descended into
    like any other value.

    Args:
        data: Arbitrary nested structure; anything that is neither a dict nor
            a list is ignored.
        raws: Set that receives the collected values. Defaults to the
            module-level ``unique_raws`` set, which keeps existing one-argument
            callers working unchanged.

    Failures are reported on stdout and swallowed (best effort), so one
    malformed entry does not abort the surrounding run.
    """
    if raws is None:
        raws = unique_raws
    try:
        if isinstance(data, dict):
            for key, value in data.items():
                if key == "raw":
                    if isinstance(value, str):
                        # set.update() on a str would add every character
                        # separately; keep a lone string as one entry.
                        raws.add(value)
                    else:
                        raws.update(value)
                process_obj(value, raws)
        # If this is a list, walk through its elements.
        elif isinstance(data, list):
            for item in data:
                process_obj(item, raws)
    except Exception as e:
        print("Failed process_obj")
        print(e)
def process_excel_file(facultet, excel_url, counter, latest_changed): def process_excel_file(facultet, excel_url, counter, latest_changed):
is_xlsx = excel_url.endswith(".xlsx") is_xlsx = excel_url.endswith(".xlsx")
download_place = f"{DIRNAME}/" + f"_[C{counter}]_" + facultet + ".xls" + ("x" if is_xlsx else "") download_place = f"{DIRNAME}/" + f"_[C{counter}]_" + facultet + ".xls" + ("x" if is_xlsx else "")
excel_filename = excel_url.split("/")[-1] excel_filename = excel_url.split("/")[-1]
if "ФЭУ" not in excel_filename:
print("SKIPPED")
return
excel_info = { excel_info = {
"filename": excel_filename, "filename": excel_filename,
@@ -80,9 +106,9 @@ def process_excel_file(facultet, excel_url, counter, latest_changed):
"group_names_parsed": [], "group_names_parsed": [],
"facultet": facultet, "facultet": facultet,
"counter": counter, "counter": counter,
"week_keys_metadata": {} "sheets": []
} }
parser.LOGGING = False parser.LOGGING = LOGGING
try: try:
aigenerated.download_file_from_url(excel_url, download_place) aigenerated.download_file_from_url(excel_url, download_place)
@@ -94,44 +120,45 @@ def process_excel_file(facultet, excel_url, counter, latest_changed):
while True: while True:
print(f"Parsing sheet №{reader.get_sheet_index()+1} (from 1)") print(f"Parsing sheet №{reader.get_sheet_index()+1} (from 1)")
sheet_dict = {
"index": reader.get_sheet_index(),
"name": reader.get_sheet_name(),
"reader_info": reader.info(),
"group_names_parsed": [],
"groups": {}
}
excel_info['sheets'].append(sheet_dict)
prs = parser.Parser(reader) prs = parser.Parser(reader)
print("Parser created; parser.parse();") print("Parser created; parser.parse();")
prs.parse() prs.parse()
print("parsed done!") print("parsed done!")
if len(prs.raw_no_schedule) > 0:
sheet_dict["raw_no_schedule"] = prs.raw_no_schedule
if len(prs.features) > 0:
sheet_dict["features"] = sorted(prs.features)
if prs.parser_error is not None: if prs.parser_error is not None:
excel_info["parser_error_cycle_" + str(reader.get_sheet_index()+1)] = prs.parser_error sheet_dict["parser_error"] = prs.parser_error
if prs.parser_warnings is not None and len(prs.parser_warnings) > 0: if prs.parser_warnings is not None and len(prs.parser_warnings) > 0:
excel_info["parser_warnings_cycle_" + str(reader.get_sheet_index()+1)] = prs.parser_warnings sheet_dict["parser_warnings"] = prs.parser_warnings
for group_name in prs.groups.keys(): for group_name in prs.groups.keys():
if group_name in result_groups.keys(): gr = prs.groups[group_name]
print(f" -- WTF -- Doubled groups -- name: {group_name}") gr["excel_url"] = excel_url
if 'warning_doubled_groups_skip' not in excel_info.keys(): sheet_dict["group_names_parsed"].append(group_name)
excel_info['warning_doubled_groups_skip'] = []
excel_info['warning_doubled_groups_skip'].append(group_name)
result['doubled_groups'].append(group_name)
continue
gr = result_groups[group_name] = prs.groups[group_name]
gr['facultet'] = facultet
gr['data_source'] = excel_filename # same as 'filename' in excel_info's
gr['data_source_hash'] = sha1hash
gr['debug'] = {
"excel_url": excel_url,
"reader_info": reader.info(),
"reader_sheet_index": reader.get_sheet_index(),
"download_place": download_place
}
excel_info["group_names_parsed"].append(group_name) excel_info["group_names_parsed"].append(group_name)
excel_info['week_keys_metadata'] = prs.week_keys_metadata result["group_names_parsed"].append(group_name)
result['stat']['groups'] += 1
sheet_dict['week_keys_metadata'] = prs.week_keys_metadata
sheet_dict['groups'][group_name] = gr
process_obj(gr['slots'])
print(f"Populates {len(prs.groups)} groups to result: " + " ".join(prs.groups.keys())) print(f"Populates {len(prs.groups)} groups: " + " ".join(prs.groups.keys()))
if not reader.has_next_sheet(): if not reader.has_next_sheet():
print("File ended") print("File ended")
@@ -159,10 +186,12 @@ def process_excel_file(facultet, excel_url, counter, latest_changed):
}) })
result['excels'].append(excel_info) result['excels'].append(excel_info)
k = "fine" if len(excel_info['group_names_parsed']) > 0 else "bad"
result['stat']['excels'][k] += 1
faileds = [] faileds = []
def main(): def main():
global result_groups, result global result
t = utils.StepTimeCounter() t = utils.StepTimeCounter()
try: try:
try: try:
@@ -189,7 +218,9 @@ def main():
if now_diffable_dates == prev_diffable_dates: if now_diffable_dates == prev_diffable_dates:
print("No date changes in vstu.ru website. Stopping") print("No date changes in vstu.ru website. Stopping")
return if not SKIP_DIFFABLE_DATES:
return
print("SKIP_DIFFABLE_DATES is True, force resuming")
counter = 10000 counter = 10000
for excel_link in EXCEL_LINKS: for excel_link in EXCEL_LINKS:
@@ -200,14 +231,8 @@ def main():
process_excel_file(facultet, excel_url, counter, latest_changed) process_excel_file(facultet, excel_url, counter, latest_changed)
print("Saving result.json") print("Saving result.json")
group_names_alphabeticaly = sorted(result_groups.keys())
sorted_groups = {}
for group_name in group_names_alphabeticaly:
sorted_groups[group_name] = result_groups[group_name]
result['groups'] = sorted_groups
result['stat']['total_parsing_time'] = t.step() result['stat']['total_parsing_time'] = t.step()
result['unique_raws'] = sorted(unique_raws)
json.dump(result, open('result.json', 'w'), indent=2, ensure_ascii=False) json.dump(result, open('result.json', 'w'), indent=2, ensure_ascii=False)
print("Saved to result.json indent=2") print("Saved to result.json indent=2")

419
parser.py
View File

@@ -3,10 +3,21 @@
PAIR_NUMS = [ PAIR_NUMS = [
"1-2", "3-4", "5-6", "7-8", "9-10", "11-12", "13-14", "15-16" "1-2", "3-4", "5-6", "7-8", "9-10", "11-12", "13-14", "15-16"
] ]
WEEKDAYS_STARTSWITH = [
"понед",
"вторник",
"среда",
"четверг",
"пятница",
"суббота"
]
bad_group_names = [
"янв", "февр", "март", "апр", "май", "сент", "окт", "ноя", "дек", "июнь", "июль", "авг"
]
from datetime import time
import json import json
import uuid
import aigenerated
from coord import Coord, Merged from coord import Coord, Merged
from translations import ExcelSheetReader from translations import ExcelSheetReader
import utils import utils
@@ -18,49 +29,126 @@ def pprint(*args, **kwargs):
if LOGGING: if LOGGING:
print(*args, **kwargs) print(*args, **kwargs)
def is_weeknum(text):
    """Return True when *text* starts with one of ``WEEKDAYS_STARTSWITH``.

    Despite the name, this checks for a weekday label, not a week number.
    The text is compared after stripping, removing inner spaces and
    lower-casing, so letter-spaced headers still match.

    Args:
        text: Cell value to test. Non-string values (numbers, ``None``,
            time objects) are never weekday labels and yield ``False``
            instead of raising ``AttributeError``.

    Returns:
        bool: ``True`` if the normalized text begins with a known prefix.
    """
    if not isinstance(text, str):
        return False
    # Normalize once instead of on every prefix comparison.
    normalized = text.strip().replace(" ", "").lower()
    return normalized.startswith(tuple(WEEKDAYS_STARTSWITH))
def is_pair(text):
    """Return True when *text* starts with one of the ``PAIR_NUMS`` labels.

    The text is compared after stripping, removing inner spaces and
    lower-casing, so ``" 1 - 2 "`` matches ``"1-2"``.

    Args:
        text: Cell value to test. Non-string values (numbers, ``None``,
            time objects) yield ``False`` instead of raising
            ``AttributeError``.

    Returns:
        bool: ``True`` if the normalized text begins with a known pair label.
    """
    if not isinstance(text, str):
        return False
    # Normalize once instead of on every label comparison.
    normalized = text.strip().replace(" ", "").lower()
    return normalized.startswith(tuple(PAIR_NUMS))
class Parser: class Parser:
def __init__(self, reader: ExcelSheetReader): def __init__(self, reader: ExcelSheetReader):
self.reader = reader self.reader = reader
self.groups = {} self.groups = {} # Группы которые удалось распарсить
self.week_keys_metadata = {} self.features = set() # фичи данной страницы
self.week_keys_metadata = {} # календарик
self.schedule_range_row = None # [min, max] диапазон col включительно где расписание
self.raw_no_schedule = [] # всё что не schedule_range_row отправляется сюда ('СОГЛАСОВАНО:', etc..)
self.weeknums: defaultdict = defaultdict(set) # no support json! self.weeknums: defaultdict = defaultdict(set) # no support json! (для week_keys_metadata)
self.parser_error = None self.parser_error = None # ошибка парсера перед выходом
self.parser_warnings = [] self.parser_warnings = [] # предупреждения парсера
pprint("Parser created for '{0}'".format(reader.info())) pprint("Parser created for '{0}'".format(reader.info()))
def parse(self): def parse(self):
monday = self.reader.find("ПОНЕДЕЛЬНИК") # Характерные признаки разных сеток
if monday is None: no_pair_numeration = False
col_distance_pair_weekday = None
weekday_firstly_calendar = False
first_weekday = self.reader.find_any(WEEKDAYS_STARTSWITH, startswith=True, nospace=True)
if first_weekday is None:
self.features.add("no_weekdays")
print(" -- Failed parse! -- ") print(" -- Failed parse! -- ")
print("ПОНЕДЕЛЬНИК НЕ НАЙДЕН!") print("дни недели не найдены!")
self.parser_error = "'ПОНЕДЕЛЬНИК' не найден в таблице." self.parser_error = f"{WEEKDAYS_STARTSWITH} ни один найден в таблице. Дня недели нет."
return return
if monday.col != 4: pair_num_any = self.reader.find_any(PAIR_NUMS, nospace=True)
print("--- warning parse! ---") if pair_num_any is None:
print(f"Monday col != 4 (actual: {monday})") no_pair_numeration = True
self.parser_warnings.append(f"Monday col != 4 (actual: {monday}); Это, наверное, может работать не стабильно!") self.features.add("no_pair_numeration")
self.parser_warnings.append(f"Нет нумерации академических часов {PAIR_NUMS}")
else:
self.features.add("pair_numeration")
col_distance_pair_weekday = pair_num_any.col - first_weekday.col
head_rx = first_weekday.row - 1 # выше первого понидельника
group_col_start = first_weekday.col + 2
if col_distance_pair_weekday is not None:
if col_distance_pair_weekday > 1:
weekday_firstly_calendar = True
self.features.add("weekdays_before_calendar")
group_col_start = pair_num_any.col + 1
head_rx = monday.row - 1 # выше первого понидельника
if head_rx < 0: if head_rx < 0:
raise Exception("head_rx < 0: Программа пыталась найти 'ПОНЕДЕЛЬНИК', но по всей видимости не нашла.") raise Exception("head_rx < 0: Программа пыталась найти день недели, но по всей видимости не нашла.")
head = self.reader.get_row_values(head_rx) # get all ROW (months, groups) head = self.reader.get_row_values(head_rx) # get all ROW (months, groups)
pprint(f"head={head}") pprint(f"head={head}")
self.groups = parse_groups(self.reader, head, monday, head_rx) # parse groups to self.groups
head_joined = " ||| ".join([v for v in head if isinstance(v, str) and v.strip()])
print(head_joined)
if "1 неделя" in head_joined or "1 НЕДЕЛЯ" in head_joined or "2 неделя" in head_joined or "2 НЕДЕЛЯ" in head_joined or "ИЗМЕНЕНИЯ" in head_joined or "изменения" in head_joined or "vtf-vstu.ru" in head_joined:
head_rx -= 1
head = self.reader.get_row_values(head_rx) # get all ROW (months, groups)
pprint(f"head (upper)={head}")
self.features.add("post_groups_info_row")
self.groups = parse_groups(self.reader, head, group_col_start, head_rx) # parse groups to self.groups
pprint(f'self.groups={json.dumps(self.groups, indent=2, ensure_ascii=False)}') pprint(f'self.groups={json.dumps(self.groups, indent=2, ensure_ascii=False)}')
pprint("\n\n\n") pprint("\n\n\n")
for group in self.groups.values(): for group in self.groups.values():
pprint("\nSTART OF PROCESS GROUP\n") pprint("\nSTART OF PROCESS GROUP\n")
self.process_group(group, monday) self.process_group(group, first_weekday, pair_num_any.col if pair_num_any else None)
pprint("\nEND OF PROCESS GROUP\n") pprint("\nEND OF PROCESS GROUP\n")
self.process_weekmetadatas(monday) # week metadatas parse
S = 9999999
group_min_col = S
group_min_row = S
def process_weekmetadatas(self, first_monday: "Coord"): for x in self.groups.values():
p = x['position']
group_min_row = min(p[0], group_min_row)
group_min_col = min(p[1], group_min_col)
if group_min_row != S and group_min_col != S:
pprint("Process weekmetadatas!")
self.process_weekmetadatas(Coord(row=group_min_row, col=group_min_col))
# parse no-schedule raws (согласовано, и т.д.)
self.parse_raw_no_schedule()
def parse_raw_no_schedule(self):
    """Collect the text of every row outside ``self.schedule_range_row``.

    For each row that is not inside the inclusive ``[first, last]`` schedule
    range, the non-blank string cells are appended to
    ``self.raw_no_schedule`` as one list per row. Rows without such cells
    are skipped. Does nothing when no schedule range has been recorded.
    """
    if self.schedule_range_row is None:
        return

    first_row, last_row = self.schedule_range_row
    row_count = self.reader.get_row_count()
    row = 0
    while row < row_count:
        if first_row <= row <= last_row:
            # Jump over the schedule block, then re-check the loop bound:
            # the schedule may extend to the last row of the sheet, in which
            # case reading ``last_row + 1`` would run past the end.
            row = last_row + 1
            continue
        texts = [
            v for v in self.reader.get_row_values(row)
            if isinstance(v, str) and v.strip()
        ]
        if texts:
            self.raw_no_schedule.append(texts)
        row += 1
def process_weekmetadatas(self, first_group: "Coord"):
"""Обработать календарик"""
for x in self.weeknums.keys(): for x in self.weeknums.keys():
pprint(x) pprint(x)
set_of_merged: set = self.weeknums[x] set_of_merged: set = self.weeknums[x]
@@ -82,14 +170,16 @@ class Parser:
self.parser_warnings.append(f"Processing weekmetadata for '{x}' failed because weekday excel block width != 1 (actual {weekday_merged.width()})") self.parser_warnings.append(f"Processing weekmetadata for '{x}' failed because weekday excel block width != 1 (actual {weekday_merged.width()})")
continue continue
month_row = first_monday.row - 1 month_row = first_group.row
curr_col = weekday_merged.low.col - 1 curr_col = first_group.col - 1
while curr_col >= 0: while curr_col >= 0:
month_pos = Coord(month_row, curr_col) month_pos = Coord(month_row, curr_col)
month_cell = month_pos.cell(self.reader) month_cell = month_pos.cell(self.reader)
if month_cell.is_empty(): if month_cell.is_empty():
pprint("month cell is empty") pprint("month cell is empty")
break curr_col -= 1
continue
month_name = str(month_cell.value).strip() month_name = str(month_cell.value).strip()
pprint(month_cell) pprint(month_cell)
all_nums_of_month = utils.parse_all_dirt(self.reader, month_pos.replace(row=weekday_merged.low.row), right=1, down=weekday_merged.height()) all_nums_of_month = utils.parse_all_dirt(self.reader, month_pos.replace(row=weekday_merged.low.row), right=1, down=weekday_merged.height())
@@ -118,6 +208,16 @@ class Parser:
def push_weekday_meta(self, weekday: str, weeknum: int, week_key_name: str, merged: "Merged"): def push_weekday_meta(self, weekday: str, weeknum: int, week_key_name: str, merged: "Merged"):
self.weeknums[week_key_name].add(merged) self.weeknums[week_key_name].add(merged)
def row_with_schedule_notify(self, row_coord):
    """Widen ``self.schedule_range_row`` so that it includes *row_coord*.

    The range is an inclusive ``[min_row, max_row]`` list. The first
    notification creates it; later ones only move a bound outwards, mutating
    the existing list in place.

    Args:
        row_coord: Row index that is known to belong to the schedule.
    """
    if self.schedule_range_row is None:
        self.schedule_range_row = [row_coord, row_coord]
        return
    bounds = self.schedule_range_row
    bounds[0] = min(bounds[0], row_coord)
    bounds[1] = max(bounds[1], row_coord)
def parse_potokoviy(self, merged: Merged): def parse_potokoviy(self, merged: Merged):
speaker = None speaker = None
location = None location = None
@@ -132,163 +232,157 @@ class Parser:
return {"loc": str(location).strip(), "leader": str(speaker).strip(), "name": str(merged.cell(self.reader).value).strip()} return {"loc": str(location).strip(), "leader": str(speaker).strip(), "name": str(merged.cell(self.reader).value).strip()}
def process_group(self, group: dict, monday: Coord): def process_group(self, group: dict, first_weekday: Coord, pair_pos_col):
""" """
Обработать группы, выполняется для каждой группы, после того как они распарены (parse_groups) Обработать группы, выполняется для каждой группы, после того как они распарены (parse_groups)
group = {'name': 'ИВТ-260', 'position': [5, 6], 'position_human': 'G6:J6'} group = {'name': 'ИВТ-260', 'position': [5, 6], 'position_human': 'G6:J6'}
""" """
pprint(f"process_group group={group}") pprint(f"process_group group={group}")
group_name = group['name'] group_name = group['name']
pprint(group_name) pprint(F"Имя группы: {group_name}")
row = group['position'][0] + 1 # counter for while, +1 for shift down; также номер строки в таблице (вроде с нуля) row_c1 = group['position'][0] + 1 # counter for while, +1 for shift down; также номер строки в таблице (вроде с нуля)
self.row_with_schedule_notify(group['position'][0])
group_header_pos = Coord(group['position'][0], group['position'][1])
width = group['width']
weeknum = 1 # номер недели, щёлкнет +1 при каком-то условии. weeknum = 1 # номер недели, щёлкнет +1 при каком-то условии.
previous_pair = None previous_pair = None
while row < self.reader.get_row_count(): # maybe условие чтобы не уйти ниже чем есть строк
pos = Coord(row, group['position'][1]) # текущая позиция, верхний левый угол (=low) weekcycles = 0
pprint(f"while pos={pos}") while row_c1 < self.reader.get_row_count():
pos_right = pos.shift(right=3) pos_c1 = Coord(row_c1, group['position'][1]) # текущая позиция, верхний левый угол (=low)
pair_pos = pos.replace(col=monday.col + 1) self.row_with_schedule_notify(pos_c1.row)
weekday_pos = pos.replace(col=monday.col)
merged = self.reader.get_merged_coord(pos)
merged_cell = merged.cell(self.reader) if pos_c1.cell(self.reader).is_nospace_nocase_same(group_name):
cv = merged_cell.value pprint("Ended with grpup name; stop moving down, break")
# В конце (12 пара:>) название группы, можно использовать как якорь
if utils.unspace(cv) == group_name:
pprint("Lesson == group name; ending group loop.")
break break
weekday_pos = pos_c1.replace(col=first_weekday.col)
weekday_cell = weekday_pos.cell(self.reader)
weekday_mr = self.reader.get_merged_coord(weekday_pos) weekday_mr = self.reader.get_merged_coord(weekday_pos)
weekday = utils.unspace(weekday_mr.cell(self.reader).value) weekday = weekday_cell.value
pair_mr = self.reader.get_merged_coord(pair_pos)
pair = utils.unspace(pair_mr.cell(self.reader).value)
skip = 0 if not is_weeknum(weekday):
if weekday == "": row_c1 += 1
if weeknum == 1: pprint("Not weeknum!")
weeknum += 1 if weekcycles > 0:
pprint("------") if (weeknum != 2):
skip = 1 pprint("Weeknum now 2")
row += 1 weekday = 0
else: weeknum = 2
break continue
if not skip: pprint(weekday)
next = 3 # на сколько пыгнуть для следующего шага? weekday_key_name = weekday + ("_1" if weeknum == 1 else "_2")
self.push_weekday_meta(weekday, weeknum, weekday_key_name, weekday_mr)
weekday_key_name = weekday + ("_1" if weeknum == 1 else "_2") # state
self.push_weekday_meta(weekday, weeknum, weekday_key_name, weekday_mr) event_no = 1
is_widely = False
override_col_range = None
all_raw = set()
pairs = set()
times = []
first_coord = None
is_empty_lesson = len(utils.parse_all_dirt(self.reader, pos, 4, 3)) == 0 # если в поле не найдено ничего.. row_c2 = row_c1
parsed_discipline_name = None while row_c2 <= weekday_mr.high.row:
parsed_location = None pos_c2 = Coord(row_c2, group['position'][1]) # текущая позиция (внутри группы, внутри дня недели), верхний левый угол (=low)
parsed_leader = None cell_c2 = pos_c2.cell(self.reader)
pairs = 1 mr_c2 = self.reader.get_merged_coord(pos_c2)
wtf_tomanypairs = False
is_solid = pos_right in merged
parsed_uncotigorized = []
is_wide_maybe_potokoviy = merged.width() > 4 # потоковая ли лекция (занимает несколько групп.)
if not is_empty_lesson: if first_coord is None:
cur = pos.shift(down=2) first_coord = pos_c2.row
while utils.has_no_bottom_border(self.reader, cur):
next += 3
pairs += 1
pprint(f"next = {next} cur={cur}")
if pairs >= 7:
wtf_tomanypairs = True
break
cur = cur.shift(down=3)
if is_wide_maybe_potokoviy: pair_num = None
ret = self.parse_potokoviy(merged) pair_num_mr = None
parsed_location = ret['loc'] if pair_pos_col is not None:
parsed_leader = ret['leader'] pair_num = pos_c2.replace(col=pair_pos_col)
parsed_discipline_name = ret['name'] pair_num_mr = self.reader.get_merged_coord(pair_num)
parsed_uncotigorized = list(utils.parse_all_dirt(self.reader, merged.low, merged.width(), next))
else: if (not is_widely) and (mr_c2.low.col < group_header_pos.col or mr_c2.high.col > group_header_pos.col + width - 1):
if (is_solid): is_widely = True
parsed_discipline_name = cv override_col_range = (mr_c2.low.col, mr_c2.high.col)
parsed_uncotigorized = list(utils.parse_all_dirt(self.reader, merged.low, 4, next)) col_low = group_header_pos.col
col_high = group_header_pos.col + width - 1
if override_col_range is not None:
col_low = min(col_low, override_col_range[0])
col_high = max(col_high, override_col_range[1])
# попытка исправить пару (1-2) если пустая. dirty_line = utils.parse_all_dirt(self.reader, Coord(row_c2, col_low), (col_high - col_low + 1), 1, with_cells=True)
fuck_empty_pair_in_excel = pair == "" if len(dirty_line) > 0:
previous_dump = previous_pair if pair_num_mr is not None:
if fuck_empty_pair_in_excel: pair_num_to_add = pair_num_mr.cell(self.reader).value.replace(" ", "").strip()
if previous_pair is None or previous_pair == "": if len(pair_num_to_add) == 0:
pair = f"EMPTY_IN_EXCEL" pair_num_to_add = "???"
else: pairs.add(pair_num_to_add)
pair = utils.next_element(PAIR_NUMS, previous_pair)
if pair != "": for cell in dirty_line:
previous_pair = pair if next == 3 else None # костыль чтобы избежать гипотетически не верной даты. if not cell.is_time:
all_raw.add(str(cell.value))
else:
dt: time = cell.value
times.append(str(dt))
# пытаемся из некотегорезированных данных выцепить место и лидера (препода) def clean_state():
prepods = set() nonlocal is_widely, override_col_range, event_no, all_raw, pairs, times, first_coord
if parsed_leader is not None: prepods.add(parsed_leader.strip()) is_widely = False
override_col_range = None
event_no += 1
all_raw = set()
pairs = set()
first_coord = None
times = []
locations = set()
if parsed_location is not None: locations.add(parsed_location.strip().replace(" ", ""))
for x in list(parsed_uncotigorized): if not utils.has_no_bottom_border(self.reader, pos_c2) and not(mr_c2.high.row - row_c2 > 0):
if aigenerated.is_surname_string(x): if not (len(all_raw) == 0):
prepods.add(x.strip()) # this code last for current state event
pprint(f"{event_no} {pairs}: {'[wide] ' if is_widely else ''} raw={all_raw}")
if aigenerated.is_room_number(x): slots = group['slots']
locations.add(x.strip().replace(" ", "") if x is not None else None) w = weekday_key_name
if w not in slots.keys():
slots[w] = {}
# попытка починить пустую дисциплину pair_name = "????"
if parsed_discipline_name is None: try:
l = sorted(utils.remove_from_list(list(parsed_uncotigorized), list(locations | prepods | set([parsed_location, parsed_leader])))) pair_name = sorted(pairs)[0]
parsed_discipline_name = " ".join(l) except: pass
# чистим сеты от мусора today = slots[w]
utils.discards_list(prepods, nones=True, emptystrings=True) obj = {
utils.discards_list(locations, nones=True, emptystrings=True) "object": "event",
utils.discards_list(parsed_uncotigorized, nones=True, emptystrings=True) "pairs": sorted(pairs),
"is_flow": is_widely,
# если не пустой предмет то записываем его "excel_range": utils.merged_humanize((first_coord, col_low, row_c2, col_high)),
if not is_empty_lesson: "raw": sorted(all_raw),
slots = group['slots'] "weekday": utils.weekday_to_num(weekday),
w = weekday_key_name "weeknum": weeknum
if w not in slots.keys():
slots[w] = {}
today = slots[w]
today[pair] = {
"excel_pos": str(pos),
"discipline_name": parsed_discipline_name.strip(),
"locations": sorted(locations),
"leads": sorted(prepods),
"is_solid": is_solid,
"time_coeff": pairs,
"is_flow": is_wide_maybe_potokoviy,
"lefttopmerged": {
"width": merged.width(),
"height": merged.height(),
"excel_range": utils.merged_humanize(merged.as_numbers())
},
"raw": sorted(parsed_uncotigorized),
"weekday": utils.weekday_to_num(weekday),
"weeknum": weeknum
}
if fuck_empty_pair_in_excel:
today[pair]['pair_num_empty'] = {
"prev": previous_dump,
"restored": pair != "",
"pair": pair
} }
if wtf_tomanypairs: if len(times) > 0:
today[pair]['to_many_parsing_time_coeff'] = True obj['times'] = times
if pair_pos_col is None:
slots[w] = obj
else:
today[pair_name] = obj
# here may be a empty all_raw
clean_state()
first_coord = None
# INCREMENT на next и конец цикла. if row_c2 >= weekday_mr.high.row:
row += next clean_state()
pprint("Last for weekday")
row_c2 += 1
row_c1 += weekday_mr.height()
weekcycles += 1
def parse_groups(reader: "ExcelSheetReader", head, monday: Coord, head_rx): def parse_groups(reader: "ExcelSheetReader", head, col_start, head_rx):
"""Распознать список групп и метаданные к ним, по сути получить список названий группы и координат её верхнего header-а (AQ6:AT6)""" """Распознать список групп и метаданные к ним, по сути получить список названий группы и координат её верхнего header-а (AQ6:AT6)"""
groups = {} groups = {}
i = 0 i = 0
@@ -296,21 +390,26 @@ def parse_groups(reader: "ExcelSheetReader", head, monday: Coord, head_rx):
x = head[i] x = head[i]
pprint(f"while i={i} head[i]={x}") pprint(f"while i={i} head[i]={x}")
merged = reader.get_merged_coord(Coord(head_rx, i)) merged = reader.get_merged_coord(Coord(head_rx, i))
if i > monday.col + 1: if i >= col_start:
if merged is None or x == "": if merged is None or x == "" or x is None:
break
if merged.width() != 4:
pprint(f"WARNING: group header witdh !=4 (found: {merged.width()}); blocks !=4 not supported by parser.")
break break
name = utils.unspace(x) name = utils.unspace(x)
groups[name] = { skip = False
"name": name, if "-" not in name:
"position": [head_rx, i], for x in bad_group_names:
"position_human": utils.merged_humanize(merged.as_numbers()), if x in name.lower():
"slots": {} skip = True
} pprint(f"Skip groupname {name} because not dash in name and in blacklist")
if not skip:
groups[name] = {
"name": name,
"position": [head_rx, i],
"width": merged.width(),
"position_human": utils.merged_humanize(merged.as_numbers()),
"slots": {}
}
if merged is None: if merged is None:
i += 1 i += 1

View File

@@ -1,6 +1,7 @@
# --- Абстрактный базовый класс (Контракт) --- # --- Абстрактный базовый класс (Контракт) ---
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from datetime import datetime, time
import openpyxl import openpyxl
import xlrd import xlrd
@@ -10,10 +11,19 @@ from coord import Coord, Merged
EMPTY_CTYPES = [xlrd.XL_CELL_EMPTY, xlrd.XL_CELL_BLANK] EMPTY_CTYPES = [xlrd.XL_CELL_EMPTY, xlrd.XL_CELL_BLANK]
class TranschendentnostCell: class TranschendentnostCell:
def __init__(self, value, is_empty): def __init__(self, value, is_empty, is_time=False):
self.value = value self.value = value
self.is_time = isinstance(value, time) or is_time
self._is_empty = is_empty self._is_empty = is_empty
def is_nospace_nocase_same(self, query):
    """Compare the cell value with *query*, ignoring case and spaces.

    Both sides are lower-cased, have their spaces removed and are stripped
    before the comparison.

    Args:
        query: Text to compare against.

    Returns:
        bool: ``True`` when both are strings and equal after normalization;
        ``False`` otherwise, including when either side is not a string
        (previously masked by a bare ``except``).
    """
    if not isinstance(self.value, str) or not isinstance(query, str):
        return False

    def _fold(text):
        return text.lower().replace(" ", "").strip()

    return _fold(self.value) == _fold(query)
def is_empty(self): def is_empty(self):
return self._is_empty return self._is_empty
@@ -29,6 +39,10 @@ class ExcelSheetReader(ABC):
def get_sheet_index(self): def get_sheet_index(self):
pass pass
# Abstract hook: the base body does nothing; concrete readers must override it.
@abstractmethod
def get_sheet_name(self):
pass
@abstractmethod @abstractmethod
def has_next_sheet(self): def has_next_sheet(self):
pass pass
@@ -71,16 +85,28 @@ class ExcelSheetReader(ABC):
return "TODO: info" return "TODO: info"
@abstractmethod @abstractmethod
def cell(self, row, col): def cell(self, row, col) -> TranschendentnostCell:
"""Возвращает абстрактную клетку""" """Возвращает абстрактную клетку"""
pass pass
def find(self, query = None): def find(self, query = None, startswith=False, nospace=False):
return self.find_any([query], startswith=startswith, nospace=nospace)
def find_any(self, query = None, startswith=False, nospace=False):
for rx in range(self.get_row_count()): for rx in range(self.get_row_count()):
i = 0 i = 0
for x in self.get_row_values(rx): for x in self.get_row_values(rx):
if x == query: if nospace:
return Coord(rx, i) x = str(x).replace(" ", "").strip()
for query_selected in query:
if x == query_selected:
return Coord(rx, i)
elif startswith:
try:
if str(x).lower().startswith(query_selected.lower()):
return Coord(rx, i)
except: pass
i += 1 i += 1
return None return None
@@ -118,6 +144,9 @@ class XlrdSheetReader(ExcelSheetReader):
def init_sheet(self): def init_sheet(self):
self.sheet = self.book.sheet_by_index(self.sheet_index) self.sheet = self.book.sheet_by_index(self.sheet_index)
# Returns the `name` attribute of the sheet object held in self.sheet.
def get_sheet_name(self):
return self.sheet.name
def has_next_sheet(self): def has_next_sheet(self):
return self.sheet_index < len(self.book.sheet_names())-1 return self.sheet_index < len(self.book.sheet_names())-1
@@ -140,7 +169,24 @@ class XlrdSheetReader(ExcelSheetReader):
def cell(self, row, col): def cell(self, row, col):
"""Возвращает абстрактную клетку""" """Возвращает абстрактную клетку"""
c = self.sheet.cell(row, col) c = self.sheet.cell(row, col)
return TranschendentnostCell(c.value, c.ctype in EMPTY_CTYPES) is_empty = c.ctype in EMPTY_CTYPES
is_time = c.ctype == xlrd.XL_CELL_DATE
value = c.value
if is_empty:
value = ""
elif is_time:
if isinstance(value, float):
if value <= 1:
seconds = round(value * 86400)
minutes, seconds = divmod(seconds, 60)
hours, minutes = divmod(minutes, 60)
value = time(hour=hours, second=seconds, minute=minutes)
else:
print(f"TODO: value is {value} its unix? not 0.xxxxxxxx")
else:
is_time = False
print("IsTime but not float!")
return TranschendentnostCell(value, is_empty, is_time=is_time)
def get_border_style(self, coord: Coord, side): def get_border_style(self, coord: Coord, side):
row = coord.row row = coord.row
@@ -192,6 +238,9 @@ class OpenpyxlSheetReader(ExcelSheetReader):
def get_sheet_index(self): def get_sheet_index(self):
return self.sheet_index return self.sheet_index
# Looks up the workbook's sheet-name list at the reader's current sheet_index.
def get_sheet_name(self):
return self.workbook.sheetnames[self.sheet_index]
def has_next_sheet(self): def has_next_sheet(self):
return self.sheet_index < len(self.workbook.sheetnames)-1 return self.sheet_index < len(self.workbook.sheetnames)-1
@@ -221,7 +270,7 @@ class OpenpyxlSheetReader(ExcelSheetReader):
c = self._get_cell(row, col) c = self._get_cell(row, col)
is_empty = (c.value is None) is_empty = (c.value is None)
return TranschendentnostCell("" if is_empty else c.value, is_empty) return TranschendentnostCell("" if is_empty else c.value, is_empty, is_time=isinstance(c.value, time))
def get_cell_value(self, row, col): def get_cell_value(self, row, col):
cell = self._get_cell(row, col) cell = self._get_cell(row, col)

View File

@@ -53,7 +53,7 @@ def remove_from_list(l: list, todel: list):
return l return l
def parse_all_dirt(reader: "ExcelSheetReader", min_pos: Coord, right, down): def parse_all_dirt(reader: "ExcelSheetReader", min_pos: Coord, right, down, with_cells=False):
RET = set() RET = set()
row = min_pos.row row = min_pos.row
@@ -61,10 +61,9 @@ def parse_all_dirt(reader: "ExcelSheetReader", min_pos: Coord, right, down):
col = min_pos.col col = min_pos.col
while col < min_pos.col + right: while col < min_pos.col + right:
#print(excel_coordinate(row, col)) #print(excel_coordinate(row, col))
cv = reader.get_cell_value(row, col) cv = reader.cell(row, col)
value = str(cv).strip() if cv is not None and not cv.is_empty():
if cv is not None and len(value) > 0: RET.add(cv if with_cells else str(cv.value))
RET.add(value)
col += 1 col += 1
row += 1 row += 1
@@ -165,7 +164,7 @@ def find(sh, query = None):
return None return None
def weekday_to_num(st: str): def weekday_to_num(st: str):
if st.upper().strip() == "ПОНЕДЕЛЬНИК": if st.upper().strip().startswith("ПОНЕД"):
return 1 return 1
if st.upper().strip() == "ВТОРНИК": if st.upper().strip() == "ВТОРНИК":
return 2 return 2
@@ -177,7 +176,7 @@ def weekday_to_num(st: str):
return 5 return 5
if st.upper().strip() == "СУББОТА": if st.upper().strip() == "СУББОТА":
return 6 return 6
if st.upper().strip() == "ВОСКРЕСЕНЬЕ": if st.upper().strip().startswith("ВОСКР"):
return 7 return 7
return -1 return -1