refactor: big, more patterns\n\nBREAKING CHANGES

This commit is contained in:
2026-03-16 20:53:42 +03:00
parent 2105e9bc36
commit 7e0e4a0b71
5 changed files with 416 additions and 244 deletions

0
Dockerfile Normal file
View File

129
main.py
View File

@@ -5,6 +5,7 @@
import json import json
import os import os
import random
import time import time
import traceback import traceback
import uuid import uuid
@@ -26,50 +27,75 @@ FACULTETS = sorted([
DIRNAME = "excels" DIRNAME = "excels"
DIFFABLE_DATES = "diffable_dates.txt" DIFFABLE_DATES = "diffable_dates.txt"
SKIP_DIFFABLE_DATES = True
DEBUG_ONE_FAC = None #'fevt' DEBUG_ONE_FAC = None #'fevt'
result_groups = {} LOGGING = False
unique_raws = set()
result = { result = {
"version": 1, "version": 1,
"notice": "ОТКАЗ ОТ ОТВЕТСТВЕННОСТИ: Данные, доступ к API и т.д. предоставляется КАК-ЕСТЬ (AS-IS) без каких либо, явно или не явно подразумеваемых гарантий.\n\nПарсер написал: Миронов Станислав\n\nИсточник данных: https://www.vstu.ru/student/raspisaniya/zanyatiy/index.php", "notice": "ОТКАЗ ОТ ОТВЕТСТВЕННОСТИ: Данные, доступ к API и т.д. предоставляется КАК-ЕСТЬ (AS-IS) без каких либо, явно или не явно подразумеваемых гарантий.\n\nПарсер написал: Миронов Станислав\n\nИсточник данных: https://www.vstu.ru/student/raspisaniya/zanyatiy/index.php",
"actual_at": round(time.time()), "actual_at": round(time.time()),
"documentation": "https://fazziclay.com/api/v1/vstu_schedule_parser/scheme.json", "documentation": "https://fazziclay.com/api/v1/vstu_schedule_parser/scheme.json (temporary outdated)",
"daypicture": "QwQ", "daypicture": "0w0",
"daycite": "running on a rope", "daycite": "KIlLSWITCH",
"contact": "https://fazziclay.com/", "contact": "https://fazziclay.com/",
"university": "VSTU", "university": "VSTU",
"university_site": "https://www.vstu.ru/", "university_site": "https://www.vstu.ru/",
"source": "https://fazziclay.com/api/v1/vstu_schedule_parser/result.json", "source": "https://fazziclay.com/api/v1/vstu_schedule_parser/result.json",
"stat": { "stat": {
"total_parsing_time": -1, "total_parsing_time": -1,
"excels": {
"fine": 0,
"bad": 0
},
"groups": 0,
"unique_raws": -1
}, },
"api_notices": { "api_notices": {
"updated_at": 1759651871, "updated_at": 1773523692,
"text": "Пожалуйста сохраняйте 'updated_at', это время изменения ЭТОГО текста. Тут возможно будут появлятся важные BREAKING CHANGES и дедлайны к ним.\nПо хорошему если updated_at другой по сравнению с вашем кэшем это сообщение должно отправляться вам в телеграм как уведомление о поедстоящих изменениях\nwarning=True значит 'text' содержит важное а не как щас hint.\n\n ~fazziclay aka Stanislav;\n\n2025-10-05: добавлено data_source_hash в эксель и в группу. Это SHA1 of скачанный эксель файл.", "text_pre1": "Пожалуйста сохраняйте 'updated_at', это время изменения ЭТОГО текста. Тут возможно будут появлятся важные BREAKING CHANGES и дедлайны к ним.\nПо хорошему если updated_at другой по сравнению с вашем кэшем это сообщение должно отправляться вам в телеграм как уведомление о поедстоящих изменениях\nwarning=True значит 'text' содержит важное а не как щас hint.\n\n ~fazziclay aka Stanislav;\n\n2025-10-05: добавлено data_source_hash в эксель и в группу. Это SHA1 of скачанный эксель файл.",
"warning": False, "text": "2026-03-15 BREAKING CHANGES! By Stanislav Mironov.\n\nИзменено многое в угоду унифкации и расширению спаршенных групп. Пока alpha",
"warning": True,
"tut-plavayuschaya-struktura": "required only 'updated_at', 'text' and 'warning'" "tut-plavayuschaya-struktura": "required only 'updated_at', 'text' and 'warning'"
}, },
"doubled_groups": [],
"debug": { "debug": {
"bleu~~": 2 "bleu~~": 3
}, },
"excels": [], "excels": [],
"facultets": FACULTETS, "facultets": FACULTETS,
"group_names_parsed": [],
"emptykey1": "", "unique_raws": unique_raws,
"emptykey2": "",
"groups": result_groups,
"emptykey3": "",
"emptykey4": "",
"see_header_at_top_of_this_file": "SEE TOP OF THIS FILE | ОБРАТИТЕ ВНИМАНИЕ НА ВЕРХ ЭТОГО ФАЙЛА" "see_header_at_top_of_this_file": "SEE TOP OF THIS FILE | ОБРАТИТЕ ВНИМАНИЕ НА ВЕРХ ЭТОГО ФАЙЛА"
} }
def process_obj(data):
    """Recursively walk parsed data and collect every "raw" entry.

    Dicts and lists are traversed depth-first. Whenever a dict key equals
    "raw", its value is merged into the module-level ``unique_raws`` set.
    Any other type is ignored. Failures are printed and swallowed so that
    one malformed node does not abort the whole run (best-effort).

    Args:
        data: Arbitrary nested structure of dicts/lists (e.g. group slots).
    """
    try:
        if isinstance(data, dict):
            for key, value in data.items():
                if key == "raw":
                    if isinstance(value, str):
                        # set.update() on a string would add it character by
                        # character; a lone string is one raw entry.
                        unique_raws.add(value)
                    else:
                        unique_raws.update(value)
                process_obj(value)
        # If this is a list, walk through its elements.
        elif isinstance(data, list):
            for item in data:
                process_obj(item)
    except Exception as e:
        print("Failed process_obj")
        print(e)
def process_excel_file(facultet, excel_url, counter, latest_changed): def process_excel_file(facultet, excel_url, counter, latest_changed):
is_xlsx = excel_url.endswith(".xlsx") is_xlsx = excel_url.endswith(".xlsx")
download_place = f"{DIRNAME}/" + f"_[C{counter}]_" + facultet + ".xls" + ("x" if is_xlsx else "") download_place = f"{DIRNAME}/" + f"_[C{counter}]_" + facultet + ".xls" + ("x" if is_xlsx else "")
excel_filename = excel_url.split("/")[-1] excel_filename = excel_url.split("/")[-1]
if "ФЭУ" not in excel_filename:
print("SKIPPED")
return
excel_info = { excel_info = {
"filename": excel_filename, "filename": excel_filename,
@@ -80,9 +106,9 @@ def process_excel_file(facultet, excel_url, counter, latest_changed):
"group_names_parsed": [], "group_names_parsed": [],
"facultet": facultet, "facultet": facultet,
"counter": counter, "counter": counter,
"week_keys_metadata": {} "sheets": []
} }
parser.LOGGING = False parser.LOGGING = LOGGING
try: try:
aigenerated.download_file_from_url(excel_url, download_place) aigenerated.download_file_from_url(excel_url, download_place)
@@ -94,44 +120,45 @@ def process_excel_file(facultet, excel_url, counter, latest_changed):
while True: while True:
print(f"Parsing sheet №{reader.get_sheet_index()+1} (from 1)") print(f"Parsing sheet №{reader.get_sheet_index()+1} (from 1)")
sheet_dict = {
"index": reader.get_sheet_index(),
"name": reader.get_sheet_name(),
"reader_info": reader.info(),
"group_names_parsed": [],
"groups": {}
}
excel_info['sheets'].append(sheet_dict)
prs = parser.Parser(reader) prs = parser.Parser(reader)
print("Parser created; parser.parse();") print("Parser created; parser.parse();")
prs.parse() prs.parse()
print("parsed done!") print("parsed done!")
if len(prs.raw_no_schedule) > 0:
sheet_dict["raw_no_schedule"] = prs.raw_no_schedule
if len(prs.features) > 0:
sheet_dict["features"] = sorted(prs.features)
if prs.parser_error is not None: if prs.parser_error is not None:
excel_info["parser_error_cycle_" + str(reader.get_sheet_index()+1)] = prs.parser_error sheet_dict["parser_error"] = prs.parser_error
if prs.parser_warnings is not None and len(prs.parser_warnings) > 0: if prs.parser_warnings is not None and len(prs.parser_warnings) > 0:
excel_info["parser_warnings_cycle_" + str(reader.get_sheet_index()+1)] = prs.parser_warnings sheet_dict["parser_warnings"] = prs.parser_warnings
for group_name in prs.groups.keys(): for group_name in prs.groups.keys():
if group_name in result_groups.keys(): gr = prs.groups[group_name]
print(f" -- WTF -- Doubled groups -- name: {group_name}") gr["excel_url"] = excel_url
if 'warning_doubled_groups_skip' not in excel_info.keys(): sheet_dict["group_names_parsed"].append(group_name)
excel_info['warning_doubled_groups_skip'] = []
excel_info['warning_doubled_groups_skip'].append(group_name)
result['doubled_groups'].append(group_name)
continue
gr = result_groups[group_name] = prs.groups[group_name]
gr['facultet'] = facultet
gr['data_source'] = excel_filename # same as 'filename' in excel_info's
gr['data_source_hash'] = sha1hash
gr['debug'] = {
"excel_url": excel_url,
"reader_info": reader.info(),
"reader_sheet_index": reader.get_sheet_index(),
"download_place": download_place
}
excel_info["group_names_parsed"].append(group_name) excel_info["group_names_parsed"].append(group_name)
excel_info['week_keys_metadata'] = prs.week_keys_metadata result["group_names_parsed"].append(group_name)
result['stat']['groups'] += 1
sheet_dict['week_keys_metadata'] = prs.week_keys_metadata
sheet_dict['groups'][group_name] = gr
process_obj(gr['slots'])
print(f"Populates {len(prs.groups)} groups to result: " + " ".join(prs.groups.keys())) print(f"Populates {len(prs.groups)} groups: " + " ".join(prs.groups.keys()))
if not reader.has_next_sheet(): if not reader.has_next_sheet():
print("File ended") print("File ended")
@@ -159,10 +186,12 @@ def process_excel_file(facultet, excel_url, counter, latest_changed):
}) })
result['excels'].append(excel_info) result['excels'].append(excel_info)
k = "fine" if len(excel_info['group_names_parsed']) > 0 else "bad"
result['stat']['excels'][k] += 1
faileds = [] faileds = []
def main(): def main():
global result_groups, result global result
t = utils.StepTimeCounter() t = utils.StepTimeCounter()
try: try:
try: try:
@@ -189,7 +218,9 @@ def main():
if now_diffable_dates == prev_diffable_dates: if now_diffable_dates == prev_diffable_dates:
print("No date changes in vstu.ru website. Stopping") print("No date changes in vstu.ru website. Stopping")
if not SKIP_DIFFABLE_DATES:
return return
print("SKIP_DIFFABLE_DATES is True, force resuming")
counter = 10000 counter = 10000
for excel_link in EXCEL_LINKS: for excel_link in EXCEL_LINKS:
@@ -200,14 +231,8 @@ def main():
process_excel_file(facultet, excel_url, counter, latest_changed) process_excel_file(facultet, excel_url, counter, latest_changed)
print("Saving result.json") print("Saving result.json")
group_names_alphabeticaly = sorted(result_groups.keys())
sorted_groups = {}
for group_name in group_names_alphabeticaly:
sorted_groups[group_name] = result_groups[group_name]
result['groups'] = sorted_groups
result['stat']['total_parsing_time'] = t.step() result['stat']['total_parsing_time'] = t.step()
result['unique_raws'] = sorted(unique_raws)
json.dump(result, open('result.json', 'w'), indent=2, ensure_ascii=False) json.dump(result, open('result.json', 'w'), indent=2, ensure_ascii=False)
print("Saved to result.json indent=2") print("Saved to result.json indent=2")

397
parser.py
View File

@@ -3,10 +3,21 @@
PAIR_NUMS = [ PAIR_NUMS = [
"1-2", "3-4", "5-6", "7-8", "9-10", "11-12", "13-14", "15-16" "1-2", "3-4", "5-6", "7-8", "9-10", "11-12", "13-14", "15-16"
] ]
WEEKDAYS_STARTSWITH = [
"понед",
"вторник",
"среда",
"четверг",
"пятница",
"суббота"
]
bad_group_names = [
"янв", "февр", "март", "апр", "май", "сент", "окт", "ноя", "дек", "июнь", "июль", "авг"
]
from datetime import time
import json import json
import uuid
import aigenerated
from coord import Coord, Merged from coord import Coord, Merged
from translations import ExcelSheetReader from translations import ExcelSheetReader
import utils import utils
@@ -18,49 +29,126 @@ def pprint(*args, **kwargs):
if LOGGING: if LOGGING:
print(*args, **kwargs) print(*args, **kwargs)
def is_weeknum(text):
    """Return True when *text* begins with one of WEEKDAYS_STARTSWITH.

    Despite the name, this checks for a weekday label, not a week number.
    The text is stripped, has its spaces removed and is lower-cased before
    the prefix comparison.

    Args:
        text: Cell value to test. Non-string values (numeric cells, None)
            are never weekday labels and yield False instead of raising.

    Returns:
        bool: True if the normalized text starts with a known prefix.
    """
    if not isinstance(text, str):
        return False
    normalized = text.strip().replace(" ", "").lower()
    # str.startswith accepts a tuple of prefixes: one call, no manual loop.
    return normalized.startswith(tuple(WEEKDAYS_STARTSWITH))
def is_pair(text):
    """Return True when *text* begins with one of the PAIR_NUMS labels.

    The text is stripped, has its spaces removed and is lower-cased before
    the prefix comparison.

    Args:
        text: Cell value to test. Non-string values (numeric cells, None)
            yield False instead of raising.

    Returns:
        bool: True if the normalized text starts with a known pair label.
    """
    if not isinstance(text, str):
        return False
    normalized = text.strip().replace(" ", "").lower()
    # str.startswith accepts a tuple of prefixes: one call, no manual loop.
    return normalized.startswith(tuple(PAIR_NUMS))
class Parser: class Parser:
def __init__(self, reader: ExcelSheetReader): def __init__(self, reader: ExcelSheetReader):
self.reader = reader self.reader = reader
self.groups = {} self.groups = {} # Группы которые удалось распарсить
self.week_keys_metadata = {} self.features = set() # фичи данной страницы
self.week_keys_metadata = {} # календарик
self.schedule_range_row = None # [min, max] диапазон col включительно где расписание
self.raw_no_schedule = [] # всё что не schedule_range_row отправляется сюда ('СОГЛАСОВАНО:', etc..)
self.weeknums: defaultdict = defaultdict(set) # no support json! self.weeknums: defaultdict = defaultdict(set) # no support json! (для week_keys_metadata)
self.parser_error = None self.parser_error = None # ошибка парсера перед выходом
self.parser_warnings = [] self.parser_warnings = [] # предупреждения парсера
pprint("Parser created for '{0}'".format(reader.info())) pprint("Parser created for '{0}'".format(reader.info()))
def parse(self): def parse(self):
monday = self.reader.find("ПОНЕДЕЛЬНИК") # Характерные признаки разных сеток
if monday is None: no_pair_numeration = False
col_distance_pair_weekday = None
weekday_firstly_calendar = False
first_weekday = self.reader.find_any(WEEKDAYS_STARTSWITH, startswith=True, nospace=True)
if first_weekday is None:
self.features.add("no_weekdays")
print(" -- Failed parse! -- ") print(" -- Failed parse! -- ")
print("ПОНЕДЕЛЬНИК НЕ НАЙДЕН!") print("дни недели не найдены!")
self.parser_error = "'ПОНЕДЕЛЬНИК' не найден в таблице." self.parser_error = f"{WEEKDAYS_STARTSWITH} ни один найден в таблице. Дня недели нет."
return return
if monday.col != 4: pair_num_any = self.reader.find_any(PAIR_NUMS, nospace=True)
print("--- warning parse! ---") if pair_num_any is None:
print(f"Monday col != 4 (actual: {monday})") no_pair_numeration = True
self.parser_warnings.append(f"Monday col != 4 (actual: {monday}); Это, наверное, может работать не стабильно!") self.features.add("no_pair_numeration")
self.parser_warnings.append(f"Нет нумерации академических часов {PAIR_NUMS}")
else:
self.features.add("pair_numeration")
col_distance_pair_weekday = pair_num_any.col - first_weekday.col
head_rx = first_weekday.row - 1 # выше первого понидельника
group_col_start = first_weekday.col + 2
if col_distance_pair_weekday is not None:
if col_distance_pair_weekday > 1:
weekday_firstly_calendar = True
self.features.add("weekdays_before_calendar")
group_col_start = pair_num_any.col + 1
head_rx = monday.row - 1 # выше первого понидельника
if head_rx < 0: if head_rx < 0:
raise Exception("head_rx < 0: Программа пыталась найти 'ПОНЕДЕЛЬНИК', но по всей видимости не нашла.") raise Exception("head_rx < 0: Программа пыталась найти день недели, но по всей видимости не нашла.")
head = self.reader.get_row_values(head_rx) # get all ROW (months, groups) head = self.reader.get_row_values(head_rx) # get all ROW (months, groups)
pprint(f"head={head}") pprint(f"head={head}")
self.groups = parse_groups(self.reader, head, monday, head_rx) # parse groups to self.groups
head_joined = " ||| ".join([v for v in head if isinstance(v, str) and v.strip()])
print(head_joined)
if "1 неделя" in head_joined or "1 НЕДЕЛЯ" in head_joined or "2 неделя" in head_joined or "2 НЕДЕЛЯ" in head_joined or "ИЗМЕНЕНИЯ" in head_joined or "изменения" in head_joined or "vtf-vstu.ru" in head_joined:
head_rx -= 1
head = self.reader.get_row_values(head_rx) # get all ROW (months, groups)
pprint(f"head (upper)={head}")
self.features.add("post_groups_info_row")
self.groups = parse_groups(self.reader, head, group_col_start, head_rx) # parse groups to self.groups
pprint(f'self.groups={json.dumps(self.groups, indent=2, ensure_ascii=False)}') pprint(f'self.groups={json.dumps(self.groups, indent=2, ensure_ascii=False)}')
pprint("\n\n\n") pprint("\n\n\n")
for group in self.groups.values(): for group in self.groups.values():
pprint("\nSTART OF PROCESS GROUP\n") pprint("\nSTART OF PROCESS GROUP\n")
self.process_group(group, monday) self.process_group(group, first_weekday, pair_num_any.col if pair_num_any else None)
pprint("\nEND OF PROCESS GROUP\n") pprint("\nEND OF PROCESS GROUP\n")
self.process_weekmetadatas(monday) # week metadatas parse
S = 9999999
group_min_col = S
group_min_row = S
def process_weekmetadatas(self, first_monday: "Coord"): for x in self.groups.values():
p = x['position']
group_min_row = min(p[0], group_min_row)
group_min_col = min(p[1], group_min_col)
if group_min_row != S and group_min_col != S:
pprint("Process weekmetadatas!")
self.process_weekmetadatas(Coord(row=group_min_row, col=group_min_col))
# parse no-schedule raws (согласовано, и т.д.)
self.parse_raw_no_schedule()
def parse_raw_no_schedule(self):
    """Parse everything outside self.schedule_range_row into self.raw_no_schedule.

    Every row that is not inside the inclusive [first, last] schedule range
    is read, reduced to its non-blank string cells, and appended to
    ``self.raw_no_schedule`` as one list per row (rows with no text are
    dropped). Does nothing when ``self.schedule_range_row`` is None.
    """
    if self.schedule_range_row is None:
        return

    first_row, last_row = self.schedule_range_row
    for row in range(self.reader.get_row_count()):
        # Skip the schedule grid itself. Iterating with an explicit bound
        # also avoids reading one row past the end of the sheet when the
        # schedule range reaches the last row.
        if first_row <= row <= last_row:
            continue
        texts = [
            v for v in self.reader.get_row_values(row)
            if isinstance(v, str) and v.strip()
        ]
        if texts:
            self.raw_no_schedule.append(texts)
def process_weekmetadatas(self, first_group: "Coord"):
"""Обработать календарик"""
for x in self.weeknums.keys(): for x in self.weeknums.keys():
pprint(x) pprint(x)
set_of_merged: set = self.weeknums[x] set_of_merged: set = self.weeknums[x]
@@ -82,14 +170,16 @@ class Parser:
self.parser_warnings.append(f"Processing weekmetadata for '{x}' failed because weekday excel block width != 1 (actual {weekday_merged.width()})") self.parser_warnings.append(f"Processing weekmetadata for '{x}' failed because weekday excel block width != 1 (actual {weekday_merged.width()})")
continue continue
month_row = first_monday.row - 1 month_row = first_group.row
curr_col = weekday_merged.low.col - 1 curr_col = first_group.col - 1
while curr_col >= 0: while curr_col >= 0:
month_pos = Coord(month_row, curr_col) month_pos = Coord(month_row, curr_col)
month_cell = month_pos.cell(self.reader) month_cell = month_pos.cell(self.reader)
if month_cell.is_empty(): if month_cell.is_empty():
pprint("month cell is empty") pprint("month cell is empty")
break curr_col -= 1
continue
month_name = str(month_cell.value).strip() month_name = str(month_cell.value).strip()
pprint(month_cell) pprint(month_cell)
all_nums_of_month = utils.parse_all_dirt(self.reader, month_pos.replace(row=weekday_merged.low.row), right=1, down=weekday_merged.height()) all_nums_of_month = utils.parse_all_dirt(self.reader, month_pos.replace(row=weekday_merged.low.row), right=1, down=weekday_merged.height())
@@ -118,6 +208,16 @@ class Parser:
def push_weekday_meta(self, weekday: str, weeknum: int, week_key_name: str, merged: "Merged"): def push_weekday_meta(self, weekday: str, weeknum: int, week_key_name: str, merged: "Merged"):
self.weeknums[week_key_name].add(merged) self.weeknums[week_key_name].add(merged)
def row_with_schedule_notify(self, row_coord):
    """Widen self.schedule_range_row so that it includes *row_coord*.

    The range is a two-item list [min_row, max_row]; it is created on the
    first call and afterwards only ever grows.
    """
    bounds = self.schedule_range_row
    if bounds is None:
        # First notification: the range starts as this single row.
        self.schedule_range_row = [row_coord, row_coord]
        return
    if bounds[1] < row_coord:
        bounds[1] = row_coord
    if bounds[0] > row_coord:
        bounds[0] = row_coord
def parse_potokoviy(self, merged: Merged): def parse_potokoviy(self, merged: Merged):
speaker = None speaker = None
location = None location = None
@@ -132,163 +232,157 @@ class Parser:
return {"loc": str(location).strip(), "leader": str(speaker).strip(), "name": str(merged.cell(self.reader).value).strip()} return {"loc": str(location).strip(), "leader": str(speaker).strip(), "name": str(merged.cell(self.reader).value).strip()}
def process_group(self, group: dict, monday: Coord): def process_group(self, group: dict, first_weekday: Coord, pair_pos_col):
""" """
Обработать группы, выполняется для каждой группы, после того как они распарены (parse_groups) Обработать группы, выполняется для каждой группы, после того как они распарены (parse_groups)
group = {'name': 'ИВТ-260', 'position': [5, 6], 'position_human': 'G6:J6'} group = {'name': 'ИВТ-260', 'position': [5, 6], 'position_human': 'G6:J6'}
""" """
pprint(f"process_group group={group}") pprint(f"process_group group={group}")
group_name = group['name'] group_name = group['name']
pprint(group_name) pprint(F"Имя группы: {group_name}")
row = group['position'][0] + 1 # counter for while, +1 for shift down; также номер строки в таблице (вроде с нуля) row_c1 = group['position'][0] + 1 # counter for while, +1 for shift down; также номер строки в таблице (вроде с нуля)
self.row_with_schedule_notify(group['position'][0])
group_header_pos = Coord(group['position'][0], group['position'][1])
width = group['width']
weeknum = 1 # номер недели, щёлкнет +1 при каком-то условии. weeknum = 1 # номер недели, щёлкнет +1 при каком-то условии.
previous_pair = None previous_pair = None
while row < self.reader.get_row_count(): # maybe условие чтобы не уйти ниже чем есть строк
pos = Coord(row, group['position'][1]) # текущая позиция, верхний левый угол (=low) weekcycles = 0
pprint(f"while pos={pos}") while row_c1 < self.reader.get_row_count():
pos_right = pos.shift(right=3) pos_c1 = Coord(row_c1, group['position'][1]) # текущая позиция, верхний левый угол (=low)
pair_pos = pos.replace(col=monday.col + 1) self.row_with_schedule_notify(pos_c1.row)
weekday_pos = pos.replace(col=monday.col)
merged = self.reader.get_merged_coord(pos)
merged_cell = merged.cell(self.reader) if pos_c1.cell(self.reader).is_nospace_nocase_same(group_name):
cv = merged_cell.value pprint("Ended with grpup name; stop moving down, break")
# В конце (12 пара:>) название группы, можно использовать как якорь
if utils.unspace(cv) == group_name:
pprint("Lesson == group name; ending group loop.")
break break
weekday_pos = pos_c1.replace(col=first_weekday.col)
weekday_cell = weekday_pos.cell(self.reader)
weekday_mr = self.reader.get_merged_coord(weekday_pos) weekday_mr = self.reader.get_merged_coord(weekday_pos)
weekday = utils.unspace(weekday_mr.cell(self.reader).value) weekday = weekday_cell.value
pair_mr = self.reader.get_merged_coord(pair_pos)
pair = utils.unspace(pair_mr.cell(self.reader).value)
skip = 0 if not is_weeknum(weekday):
if weekday == "": row_c1 += 1
if weeknum == 1: pprint("Not weeknum!")
weeknum += 1 if weekcycles > 0:
pprint("------") if (weeknum != 2):
skip = 1 pprint("Weeknum now 2")
row += 1 weekday = 0
else: weeknum = 2
break continue
if not skip:
next = 3 # на сколько пыгнуть для следующего шага?
pprint(weekday)
weekday_key_name = weekday + ("_1" if weeknum == 1 else "_2") weekday_key_name = weekday + ("_1" if weeknum == 1 else "_2")
self.push_weekday_meta(weekday, weeknum, weekday_key_name, weekday_mr) self.push_weekday_meta(weekday, weeknum, weekday_key_name, weekday_mr)
is_empty_lesson = len(utils.parse_all_dirt(self.reader, pos, 4, 3)) == 0 # если в поле не найдено ничего.. # state
parsed_discipline_name = None event_no = 1
parsed_location = None is_widely = False
parsed_leader = None override_col_range = None
pairs = 1 all_raw = set()
wtf_tomanypairs = False pairs = set()
is_solid = pos_right in merged times = []
parsed_uncotigorized = [] first_coord = None
is_wide_maybe_potokoviy = merged.width() > 4 # потоковая ли лекция (занимает несколько групп.)
if not is_empty_lesson: row_c2 = row_c1
cur = pos.shift(down=2) while row_c2 <= weekday_mr.high.row:
while utils.has_no_bottom_border(self.reader, cur): pos_c2 = Coord(row_c2, group['position'][1]) # текущая позиция (внутри группы, внутри дня недели), верхний левый угол (=low)
next += 3 cell_c2 = pos_c2.cell(self.reader)
pairs += 1 mr_c2 = self.reader.get_merged_coord(pos_c2)
pprint(f"next = {next} cur={cur}")
if pairs >= 7:
wtf_tomanypairs = True
break
cur = cur.shift(down=3)
if is_wide_maybe_potokoviy: if first_coord is None:
ret = self.parse_potokoviy(merged) first_coord = pos_c2.row
parsed_location = ret['loc']
parsed_leader = ret['leader']
parsed_discipline_name = ret['name']
parsed_uncotigorized = list(utils.parse_all_dirt(self.reader, merged.low, merged.width(), next))
pair_num = None
pair_num_mr = None
if pair_pos_col is not None:
pair_num = pos_c2.replace(col=pair_pos_col)
pair_num_mr = self.reader.get_merged_coord(pair_num)
if (not is_widely) and (mr_c2.low.col < group_header_pos.col or mr_c2.high.col > group_header_pos.col + width - 1):
is_widely = True
override_col_range = (mr_c2.low.col, mr_c2.high.col)
col_low = group_header_pos.col
col_high = group_header_pos.col + width - 1
if override_col_range is not None:
col_low = min(col_low, override_col_range[0])
col_high = max(col_high, override_col_range[1])
dirty_line = utils.parse_all_dirt(self.reader, Coord(row_c2, col_low), (col_high - col_low + 1), 1, with_cells=True)
if len(dirty_line) > 0:
if pair_num_mr is not None:
pair_num_to_add = pair_num_mr.cell(self.reader).value.replace(" ", "").strip()
if len(pair_num_to_add) == 0:
pair_num_to_add = "???"
pairs.add(pair_num_to_add)
for cell in dirty_line:
if not cell.is_time:
all_raw.add(str(cell.value))
else: else:
if (is_solid): dt: time = cell.value
parsed_discipline_name = cv times.append(str(dt))
parsed_uncotigorized = list(utils.parse_all_dirt(self.reader, merged.low, 4, next)) def clean_state():
nonlocal is_widely, override_col_range, event_no, all_raw, pairs, times, first_coord
is_widely = False
override_col_range = None
event_no += 1
all_raw = set()
pairs = set()
first_coord = None
times = []
# попытка исправить пару (1-2) если пустая.
fuck_empty_pair_in_excel = pair == ""
previous_dump = previous_pair
if fuck_empty_pair_in_excel:
if previous_pair is None or previous_pair == "":
pair = f"EMPTY_IN_EXCEL"
else:
pair = utils.next_element(PAIR_NUMS, previous_pair)
if pair != "": if not utils.has_no_bottom_border(self.reader, pos_c2) and not(mr_c2.high.row - row_c2 > 0):
previous_pair = pair if next == 3 else None # костыль чтобы избежать гипотетически не верной даты. if not (len(all_raw) == 0):
# this code last for current state event
pprint(f"{event_no} {pairs}: {'[wide] ' if is_widely else ''} raw={all_raw}")
# пытаемся из некотегорезированных данных выцепить место и лидера (препода)
prepods = set()
if parsed_leader is not None: prepods.add(parsed_leader.strip())
locations = set()
if parsed_location is not None: locations.add(parsed_location.strip().replace(" ", ""))
for x in list(parsed_uncotigorized):
if aigenerated.is_surname_string(x):
prepods.add(x.strip())
if aigenerated.is_room_number(x):
locations.add(x.strip().replace(" ", "") if x is not None else None)
# попытка починить пустую дисциплину
if parsed_discipline_name is None:
l = sorted(utils.remove_from_list(list(parsed_uncotigorized), list(locations | prepods | set([parsed_location, parsed_leader]))))
parsed_discipline_name = " ".join(l)
# чистим сеты от мусора
utils.discards_list(prepods, nones=True, emptystrings=True)
utils.discards_list(locations, nones=True, emptystrings=True)
utils.discards_list(parsed_uncotigorized, nones=True, emptystrings=True)
# если не пустой предмет то записываем его
if not is_empty_lesson:
slots = group['slots'] slots = group['slots']
w = weekday_key_name w = weekday_key_name
if w not in slots.keys(): if w not in slots.keys():
slots[w] = {} slots[w] = {}
pair_name = "????"
try:
pair_name = sorted(pairs)[0]
except: pass
today = slots[w] today = slots[w]
today[pair] = { obj = {
"excel_pos": str(pos), "object": "event",
"discipline_name": parsed_discipline_name.strip(), "pairs": sorted(pairs),
"locations": sorted(locations), "is_flow": is_widely,
"leads": sorted(prepods), "excel_range": utils.merged_humanize((first_coord, col_low, row_c2, col_high)),
"is_solid": is_solid, "raw": sorted(all_raw),
"time_coeff": pairs,
"is_flow": is_wide_maybe_potokoviy,
"lefttopmerged": {
"width": merged.width(),
"height": merged.height(),
"excel_range": utils.merged_humanize(merged.as_numbers())
},
"raw": sorted(parsed_uncotigorized),
"weekday": utils.weekday_to_num(weekday), "weekday": utils.weekday_to_num(weekday),
"weeknum": weeknum "weeknum": weeknum
} }
if fuck_empty_pair_in_excel: if len(times) > 0:
today[pair]['pair_num_empty'] = { obj['times'] = times
"prev": previous_dump,
"restored": pair != "", if pair_pos_col is None:
"pair": pair slots[w] = obj
} else:
if wtf_tomanypairs: today[pair_name] = obj
today[pair]['to_many_parsing_time_coeff'] = True # here may be a empty all_raw
clean_state()
first_coord = None
# INCREMENT на next и конец цикла. if row_c2 >= weekday_mr.high.row:
row += next clean_state()
pprint("Last for weekday")
row_c2 += 1
row_c1 += weekday_mr.height()
weekcycles += 1
def parse_groups(reader: "ExcelSheetReader", head, monday: Coord, head_rx): def parse_groups(reader: "ExcelSheetReader", head, col_start, head_rx):
"""Распознать список групп и метаданные к ним, по сути получить список названий группы и координат её верхнего header-а (AQ6:AT6)""" """Распознать список групп и метаданные к ним, по сути получить список названий группы и координат её верхнего header-а (AQ6:AT6)"""
groups = {} groups = {}
i = 0 i = 0
@@ -296,18 +390,23 @@ def parse_groups(reader: "ExcelSheetReader", head, monday: Coord, head_rx):
x = head[i] x = head[i]
pprint(f"while i={i} head[i]={x}") pprint(f"while i={i} head[i]={x}")
merged = reader.get_merged_coord(Coord(head_rx, i)) merged = reader.get_merged_coord(Coord(head_rx, i))
if i > monday.col + 1: if i >= col_start:
if merged is None or x == "": if merged is None or x == "" or x is None:
break
if merged.width() != 4:
pprint(f"WARNING: group header witdh !=4 (found: {merged.width()}); blocks !=4 not supported by parser.")
break break
name = utils.unspace(x) name = utils.unspace(x)
skip = False
if "-" not in name:
for x in bad_group_names:
if x in name.lower():
skip = True
pprint(f"Skip groupname {name} because not dash in name and in blacklist")
if not skip:
groups[name] = { groups[name] = {
"name": name, "name": name,
"position": [head_rx, i], "position": [head_rx, i],
"width": merged.width(),
"position_human": utils.merged_humanize(merged.as_numbers()), "position_human": utils.merged_humanize(merged.as_numbers()),
"slots": {} "slots": {}
} }

View File

@@ -1,6 +1,7 @@
# --- Абстрактный базовый класс (Контракт) --- # --- Абстрактный базовый класс (Контракт) ---
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from datetime import datetime, time
import openpyxl import openpyxl
import xlrd import xlrd
@@ -10,10 +11,19 @@ from coord import Coord, Merged
EMPTY_CTYPES = [xlrd.XL_CELL_EMPTY, xlrd.XL_CELL_BLANK] EMPTY_CTYPES = [xlrd.XL_CELL_EMPTY, xlrd.XL_CELL_BLANK]
class TranschendentnostCell: class TranschendentnostCell:
def __init__(self, value, is_empty): def __init__(self, value, is_empty, is_time=False):
self.value = value self.value = value
self.is_time = isinstance(value, time) or is_time
self._is_empty = is_empty self._is_empty = is_empty
def is_nospace_nocase_same(self, query):
    """Compare the cell value with *query*, ignoring case and spaces.

    Both sides are lower-cased, have their spaces removed and are stripped
    before comparison.

    Args:
        query: Text to compare against.

    Returns:
        bool: True on a match; False on a mismatch or when either side is
        not a string (e.g. numeric or time cells).
    """
    # Explicit type checks replace a bare ``except: pass``, which would also
    # have hidden unrelated errors such as KeyboardInterrupt.
    if not isinstance(self.value, str) or not isinstance(query, str):
        return False
    own = self.value.lower().replace(" ", "").strip()
    other = query.lower().replace(" ", "").strip()
    return own == other
def is_empty(self): def is_empty(self):
return self._is_empty return self._is_empty
@@ -29,6 +39,10 @@ class ExcelSheetReader(ABC):
def get_sheet_index(self): def get_sheet_index(self):
pass pass
@abstractmethod
def get_sheet_name(self):
pass
@abstractmethod @abstractmethod
def has_next_sheet(self): def has_next_sheet(self):
pass pass
@@ -71,16 +85,28 @@ class ExcelSheetReader(ABC):
return "TODO: info" return "TODO: info"
@abstractmethod @abstractmethod
def cell(self, row, col): def cell(self, row, col) -> TranschendentnostCell:
"""Возвращает абстрактную клетку""" """Возвращает абстрактную клетку"""
pass pass
def find(self, query = None): def find(self, query = None, startswith=False, nospace=False):
return self.find_any([query], startswith=startswith, nospace=nospace)
def find_any(self, query = None, startswith=False, nospace=False):
for rx in range(self.get_row_count()): for rx in range(self.get_row_count()):
i = 0 i = 0
for x in self.get_row_values(rx): for x in self.get_row_values(rx):
if x == query: if nospace:
x = str(x).replace(" ", "").strip()
for query_selected in query:
if x == query_selected:
return Coord(rx, i) return Coord(rx, i)
elif startswith:
try:
if str(x).lower().startswith(query_selected.lower()):
return Coord(rx, i)
except: pass
i += 1 i += 1
return None return None
@@ -118,6 +144,9 @@ class XlrdSheetReader(ExcelSheetReader):
def init_sheet(self): def init_sheet(self):
self.sheet = self.book.sheet_by_index(self.sheet_index) self.sheet = self.book.sheet_by_index(self.sheet_index)
def get_sheet_name(self):
    """Return the name of the currently selected sheet."""
    current_sheet = self.sheet
    return current_sheet.name
def has_next_sheet(self): def has_next_sheet(self):
return self.sheet_index < len(self.book.sheet_names())-1 return self.sheet_index < len(self.book.sheet_names())-1
@@ -140,7 +169,24 @@ class XlrdSheetReader(ExcelSheetReader):
def cell(self, row, col): def cell(self, row, col):
"""Возвращает абстрактную клетку""" """Возвращает абстрактную клетку"""
c = self.sheet.cell(row, col) c = self.sheet.cell(row, col)
return TranschendentnostCell(c.value, c.ctype in EMPTY_CTYPES) is_empty = c.ctype in EMPTY_CTYPES
is_time = c.ctype == xlrd.XL_CELL_DATE
value = c.value
if is_empty:
value = ""
elif is_time:
if isinstance(value, float):
if value <= 1:
seconds = round(value * 86400)
minutes, seconds = divmod(seconds, 60)
hours, minutes = divmod(minutes, 60)
value = time(hour=hours, second=seconds, minute=minutes)
else:
print(f"TODO: value is {value} its unix? not 0.xxxxxxxx")
else:
is_time = False
print("IsTime but not float!")
return TranschendentnostCell(value, is_empty, is_time=is_time)
def get_border_style(self, coord: Coord, side): def get_border_style(self, coord: Coord, side):
row = coord.row row = coord.row
@@ -192,6 +238,9 @@ class OpenpyxlSheetReader(ExcelSheetReader):
def get_sheet_index(self): def get_sheet_index(self):
return self.sheet_index return self.sheet_index
def get_sheet_name(self):
    """Return the name of the sheet at the current sheet index."""
    names = self.workbook.sheetnames
    return names[self.sheet_index]
def has_next_sheet(self): def has_next_sheet(self):
return self.sheet_index < len(self.workbook.sheetnames)-1 return self.sheet_index < len(self.workbook.sheetnames)-1
@@ -221,7 +270,7 @@ class OpenpyxlSheetReader(ExcelSheetReader):
c = self._get_cell(row, col) c = self._get_cell(row, col)
is_empty = (c.value is None) is_empty = (c.value is None)
return TranschendentnostCell("" if is_empty else c.value, is_empty) return TranschendentnostCell("" if is_empty else c.value, is_empty, is_time=isinstance(c.value, time))
def get_cell_value(self, row, col): def get_cell_value(self, row, col):
cell = self._get_cell(row, col) cell = self._get_cell(row, col)

View File

@@ -53,7 +53,7 @@ def remove_from_list(l: list, todel: list):
return l return l
def parse_all_dirt(reader: "ExcelSheetReader", min_pos: Coord, right, down): def parse_all_dirt(reader: "ExcelSheetReader", min_pos: Coord, right, down, with_cells=False):
RET = set() RET = set()
row = min_pos.row row = min_pos.row
@@ -61,10 +61,9 @@ def parse_all_dirt(reader: "ExcelSheetReader", min_pos: Coord, right, down):
col = min_pos.col col = min_pos.col
while col < min_pos.col + right: while col < min_pos.col + right:
#print(excel_coordinate(row, col)) #print(excel_coordinate(row, col))
cv = reader.get_cell_value(row, col) cv = reader.cell(row, col)
value = str(cv).strip() if cv is not None and not cv.is_empty():
if cv is not None and len(value) > 0: RET.add(cv if with_cells else str(cv.value))
RET.add(value)
col += 1 col += 1
row += 1 row += 1
@@ -165,7 +164,7 @@ def find(sh, query = None):
return None return None
def weekday_to_num(st: str): def weekday_to_num(st: str):
if st.upper().strip() == "ПОНЕДЕЛЬНИК": if st.upper().strip().startswith("ПОНЕД"):
return 1 return 1
if st.upper().strip() == "ВТОРНИК": if st.upper().strip() == "ВТОРНИК":
return 2 return 2
@@ -177,7 +176,7 @@ def weekday_to_num(st: str):
return 5 return 5
if st.upper().strip() == "СУББОТА": if st.upper().strip() == "СУББОТА":
return 6 return 6
if st.upper().strip() == "ВОСКРЕСЕНЬЕ": if st.upper().strip().startswith("ВОСКР"):
return 7 return 7
return -1 return -1