refactor: big, more patterns\n\nBREAKING CHANGES
This commit is contained in:
0
Dockerfile
Normal file
0
Dockerfile
Normal file
131
main.py
131
main.py
@@ -5,6 +5,7 @@
|
||||
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import time
|
||||
import traceback
|
||||
import uuid
|
||||
@@ -26,50 +27,75 @@ FACULTETS = sorted([
|
||||
DIRNAME = "excels"
|
||||
DIFFABLE_DATES = "diffable_dates.txt"
|
||||
|
||||
SKIP_DIFFABLE_DATES = True
|
||||
|
||||
DEBUG_ONE_FAC = None #'fevt'
|
||||
result_groups = {}
|
||||
LOGGING = False
|
||||
|
||||
unique_raws = set()
|
||||
result = {
|
||||
"version": 1,
|
||||
"notice": "ОТКАЗ ОТ ОТВЕТСТВЕННОСТИ: Данные, доступ к API и т.д. предоставляется КАК-ЕСТЬ (AS-IS) без каких либо, явно или не явно подразумеваемых гарантий.\n\nПарсер написал: Миронов Станислав\n\nИсточник данных: https://www.vstu.ru/student/raspisaniya/zanyatiy/index.php",
|
||||
"actual_at": round(time.time()),
|
||||
"documentation": "https://fazziclay.com/api/v1/vstu_schedule_parser/scheme.json",
|
||||
"daypicture": "QwQ",
|
||||
"daycite": "running on a rope",
|
||||
"documentation": "https://fazziclay.com/api/v1/vstu_schedule_parser/scheme.json (temporary outdated)",
|
||||
"daypicture": "0w0",
|
||||
"daycite": "KIlLSWITCH",
|
||||
"contact": "https://fazziclay.com/",
|
||||
"university": "VSTU",
|
||||
"university_site": "https://www.vstu.ru/",
|
||||
"source": "https://fazziclay.com/api/v1/vstu_schedule_parser/result.json",
|
||||
"stat": {
|
||||
"total_parsing_time": -1,
|
||||
"excels": {
|
||||
"fine": 0,
|
||||
"bad": 0
|
||||
},
|
||||
"groups": 0,
|
||||
"unique_raws": -1
|
||||
},
|
||||
"api_notices": {
|
||||
"updated_at": 1759651871,
|
||||
"text": "Пожалуйста сохраняйте 'updated_at', это время изменения ЭТОГО текста. Тут возможно будут появлятся важные BREAKING CHANGES и дедлайны к ним.\nПо хорошему если updated_at другой по сравнению с вашем кэшем это сообщение должно отправляться вам в телеграм как уведомление о поедстоящих изменениях\nwarning=True значит 'text' содержит важное а не как щас hint.\n\n ~fazziclay aka Stanislav;\n\n2025-10-05: добавлено data_source_hash в эксель и в группу. Это SHA1 of скачанный эксель файл.",
|
||||
"warning": False,
|
||||
"updated_at": 1773523692,
|
||||
"text_pre1": "Пожалуйста сохраняйте 'updated_at', это время изменения ЭТОГО текста. Тут возможно будут появлятся важные BREAKING CHANGES и дедлайны к ним.\nПо хорошему если updated_at другой по сравнению с вашем кэшем это сообщение должно отправляться вам в телеграм как уведомление о поедстоящих изменениях\nwarning=True значит 'text' содержит важное а не как щас hint.\n\n ~fazziclay aka Stanislav;\n\n2025-10-05: добавлено data_source_hash в эксель и в группу. Это SHA1 of скачанный эксель файл.",
|
||||
"text": "2026-03-15 BREAKING CHANGES! By Stanislav Mironov.\n\nИзменено многое в угоду унифкации и расширению спаршенных групп. Пока alpha",
|
||||
"warning": True,
|
||||
"tut-plavayuschaya-struktura": "required only 'updated_at', 'text' and 'warning'"
|
||||
},
|
||||
"doubled_groups": [],
|
||||
"debug": {
|
||||
"bleu~~": 2
|
||||
"bleu~~": 3
|
||||
},
|
||||
"excels": [],
|
||||
"facultets": FACULTETS,
|
||||
|
||||
"emptykey1": "",
|
||||
"emptykey2": "",
|
||||
|
||||
"groups": result_groups,
|
||||
|
||||
"emptykey3": "",
|
||||
"emptykey4": "",
|
||||
"group_names_parsed": [],
|
||||
"unique_raws": unique_raws,
|
||||
"see_header_at_top_of_this_file": "SEE TOP OF THIS FILE | ОБРАТИТЕ ВНИМАНИЕ НА ВЕРХ ЭТОГО ФАЙЛА"
|
||||
}
|
||||
|
||||
def process_obj(data, sink=None):
    """Recursively collect every value stored under a ``"raw"`` key.

    Walks nested dicts and lists. Whenever a dict key equals ``"raw"``, its
    value is merged into *sink*; the walk then continues into the value
    itself, so nested structures are covered as well.

    Args:
        data: Arbitrary object; only ``dict`` and ``list`` are descended into,
            anything else is ignored.
        sink: Set that receives the collected values. Defaults to the
            module-level ``unique_raws`` set.

    Failures are reported with ``print`` and never propagate (best-effort
    statistics must not abort the parsing run).
    """
    if sink is None:
        sink = unique_raws

    try:
        if isinstance(data, dict):
            for key, value in data.items():
                if key == "raw":
                    if isinstance(value, str):
                        # set.update() on a str would add its individual
                        # characters; a lone string is one raw entry.
                        sink.add(value)
                    else:
                        sink.update(value)

                process_obj(value, sink)

        # If it is a list, walk through its elements.
        elif isinstance(data, list):
            for item in data:
                process_obj(item, sink)

    except Exception as e:
        print("Failed process_obj")
        print(e)
|
||||
|
||||
def process_excel_file(facultet, excel_url, counter, latest_changed):
|
||||
is_xlsx = excel_url.endswith(".xlsx")
|
||||
download_place = f"{DIRNAME}/" + f"_[C{counter}]_" + facultet + ".xls" + ("x" if is_xlsx else "")
|
||||
|
||||
excel_filename = excel_url.split("/")[-1]
|
||||
if "ФЭУ" not in excel_filename:
|
||||
print("SKIPPED")
|
||||
return
|
||||
|
||||
excel_info = {
|
||||
"filename": excel_filename,
|
||||
@@ -80,9 +106,9 @@ def process_excel_file(facultet, excel_url, counter, latest_changed):
|
||||
"group_names_parsed": [],
|
||||
"facultet": facultet,
|
||||
"counter": counter,
|
||||
"week_keys_metadata": {}
|
||||
"sheets": []
|
||||
}
|
||||
parser.LOGGING = False
|
||||
parser.LOGGING = LOGGING
|
||||
|
||||
try:
|
||||
aigenerated.download_file_from_url(excel_url, download_place)
|
||||
@@ -94,44 +120,45 @@ def process_excel_file(facultet, excel_url, counter, latest_changed):
|
||||
|
||||
while True:
|
||||
print(f"Parsing sheet №{reader.get_sheet_index()+1} (from 1)")
|
||||
sheet_dict = {
|
||||
"index": reader.get_sheet_index(),
|
||||
"name": reader.get_sheet_name(),
|
||||
"reader_info": reader.info(),
|
||||
"group_names_parsed": [],
|
||||
"groups": {}
|
||||
}
|
||||
excel_info['sheets'].append(sheet_dict)
|
||||
prs = parser.Parser(reader)
|
||||
|
||||
print("Parser created; parser.parse();")
|
||||
prs.parse()
|
||||
|
||||
print("parsed done!")
|
||||
|
||||
if len(prs.raw_no_schedule) > 0:
|
||||
sheet_dict["raw_no_schedule"] = prs.raw_no_schedule
|
||||
|
||||
if len(prs.features) > 0:
|
||||
sheet_dict["features"] = sorted(prs.features)
|
||||
|
||||
if prs.parser_error is not None:
|
||||
excel_info["parser_error_cycle_" + str(reader.get_sheet_index()+1)] = prs.parser_error
|
||||
sheet_dict["parser_error"] = prs.parser_error
|
||||
|
||||
if prs.parser_warnings is not None and len(prs.parser_warnings) > 0:
|
||||
excel_info["parser_warnings_cycle_" + str(reader.get_sheet_index()+1)] = prs.parser_warnings
|
||||
sheet_dict["parser_warnings"] = prs.parser_warnings
|
||||
|
||||
for group_name in prs.groups.keys():
|
||||
if group_name in result_groups.keys():
|
||||
print(f" -- WTF -- Doubled groups -- name: {group_name}")
|
||||
if 'warning_doubled_groups_skip' not in excel_info.keys():
|
||||
excel_info['warning_doubled_groups_skip'] = []
|
||||
|
||||
excel_info['warning_doubled_groups_skip'].append(group_name)
|
||||
result['doubled_groups'].append(group_name)
|
||||
|
||||
|
||||
continue
|
||||
|
||||
gr = result_groups[group_name] = prs.groups[group_name]
|
||||
gr['facultet'] = facultet
|
||||
gr['data_source'] = excel_filename # same as 'filename' in excel_info's
|
||||
gr['data_source_hash'] = sha1hash
|
||||
gr['debug'] = {
|
||||
"excel_url": excel_url,
|
||||
"reader_info": reader.info(),
|
||||
"reader_sheet_index": reader.get_sheet_index(),
|
||||
"download_place": download_place
|
||||
}
|
||||
gr = prs.groups[group_name]
|
||||
gr["excel_url"] = excel_url
|
||||
sheet_dict["group_names_parsed"].append(group_name)
|
||||
excel_info["group_names_parsed"].append(group_name)
|
||||
excel_info['week_keys_metadata'] = prs.week_keys_metadata
|
||||
result["group_names_parsed"].append(group_name)
|
||||
result['stat']['groups'] += 1
|
||||
sheet_dict['week_keys_metadata'] = prs.week_keys_metadata
|
||||
sheet_dict['groups'][group_name] = gr
|
||||
process_obj(gr['slots'])
|
||||
|
||||
print(f"Populates {len(prs.groups)} groups to result: " + " ".join(prs.groups.keys()))
|
||||
print(f"Populates {len(prs.groups)} groups: " + " ".join(prs.groups.keys()))
|
||||
|
||||
if not reader.has_next_sheet():
|
||||
print("File ended")
|
||||
@@ -159,10 +186,12 @@ def process_excel_file(facultet, excel_url, counter, latest_changed):
|
||||
})
|
||||
|
||||
result['excels'].append(excel_info)
|
||||
k = "fine" if len(excel_info['group_names_parsed']) > 0 else "bad"
|
||||
result['stat']['excels'][k] += 1
|
||||
|
||||
faileds = []
|
||||
def main():
|
||||
global result_groups, result
|
||||
global result
|
||||
t = utils.StepTimeCounter()
|
||||
try:
|
||||
try:
|
||||
@@ -189,7 +218,9 @@ def main():
|
||||
|
||||
if now_diffable_dates == prev_diffable_dates:
|
||||
print("No date changes in vstu.ru website. Stopping")
|
||||
return
|
||||
if not SKIP_DIFFABLE_DATES:
|
||||
return
|
||||
print("SKIP_DIFFABLE_DATES is True, force resuming")
|
||||
|
||||
counter = 10000
|
||||
for excel_link in EXCEL_LINKS:
|
||||
@@ -200,14 +231,8 @@ def main():
|
||||
process_excel_file(facultet, excel_url, counter, latest_changed)
|
||||
|
||||
print("Saving result.json")
|
||||
group_names_alphabeticaly = sorted(result_groups.keys())
|
||||
sorted_groups = {}
|
||||
for group_name in group_names_alphabeticaly:
|
||||
sorted_groups[group_name] = result_groups[group_name]
|
||||
|
||||
result['groups'] = sorted_groups
|
||||
|
||||
result['stat']['total_parsing_time'] = t.step()
|
||||
result['unique_raws'] = sorted(unique_raws)
|
||||
|
||||
json.dump(result, open('result.json', 'w'), indent=2, ensure_ascii=False)
|
||||
print("Saved to result.json indent=2")
|
||||
|
||||
419
parser.py
419
parser.py
@@ -3,10 +3,21 @@
|
||||
PAIR_NUMS = [
|
||||
"1-2", "3-4", "5-6", "7-8", "9-10", "11-12", "13-14", "15-16"
|
||||
]
|
||||
WEEKDAYS_STARTSWITH = [
|
||||
"понед",
|
||||
"вторник",
|
||||
"среда",
|
||||
"четверг",
|
||||
"пятница",
|
||||
"суббота"
|
||||
]
|
||||
|
||||
bad_group_names = [
|
||||
"янв", "февр", "март", "апр", "май", "сент", "окт", "ноя", "дек", "июнь", "июль", "авг"
|
||||
]
|
||||
|
||||
from datetime import time
|
||||
import json
|
||||
import uuid
|
||||
import aigenerated
|
||||
from coord import Coord, Merged
|
||||
from translations import ExcelSheetReader
|
||||
import utils
|
||||
@@ -18,49 +29,126 @@ def pprint(*args, **kwargs):
|
||||
if LOGGING:
|
||||
print(*args, **kwargs)
|
||||
|
||||
def is_weeknum(text):
    """Return True when *text* starts with one of ``WEEKDAYS_STARTSWITH``.

    Despite its name, this checks for a weekday label, not a week number.
    The text is stripped, its spaces are removed and it is lower-cased
    before the prefix comparison.

    Non-string input (for example a numeric cell value) is converted with
    ``str()`` instead of raising ``AttributeError``.
    """
    normalized = str(text).strip().replace(" ", "").lower()
    # str.startswith accepts a tuple of prefixes, so no explicit loop needed.
    return normalized.startswith(tuple(WEEKDAYS_STARTSWITH))
|
||||
|
||||
def is_pair(text):
    """Return True when *text* starts with one of the ``PAIR_NUMS`` labels.

    The text is stripped, its spaces are removed and it is lower-cased
    before the prefix comparison, so ``" 1 - 2 "`` matches ``"1-2"``.

    Non-string input (for example a numeric cell value) is converted with
    ``str()`` instead of raising ``AttributeError``.
    """
    normalized = str(text).strip().replace(" ", "").lower()
    # str.startswith accepts a tuple of prefixes, so no explicit loop needed.
    return normalized.startswith(tuple(PAIR_NUMS))
|
||||
|
||||
class Parser:
|
||||
def __init__(self, reader: ExcelSheetReader):
|
||||
self.reader = reader
|
||||
self.groups = {}
|
||||
self.week_keys_metadata = {}
|
||||
self.groups = {} # Группы которые удалось распарсить
|
||||
self.features = set() # фичи данной страницы
|
||||
self.week_keys_metadata = {} # календарик
|
||||
self.schedule_range_row = None # [min, max] диапазон col включительно где расписание
|
||||
self.raw_no_schedule = [] # всё что не schedule_range_row отправляется сюда ('СОГЛАСОВАНО:', etc..)
|
||||
|
||||
self.weeknums: defaultdict = defaultdict(set) # no support json!
|
||||
self.parser_error = None
|
||||
self.parser_warnings = []
|
||||
self.weeknums: defaultdict = defaultdict(set) # no support json! (для week_keys_metadata)
|
||||
self.parser_error = None # ошибка парсера перед выходом
|
||||
self.parser_warnings = [] # предупреждения парсера
|
||||
pprint("Parser created for '{0}'".format(reader.info()))
|
||||
|
||||
def parse(self):
|
||||
monday = self.reader.find("ПОНЕДЕЛЬНИК")
|
||||
if monday is None:
|
||||
# Характерные признаки разных сеток
|
||||
no_pair_numeration = False
|
||||
col_distance_pair_weekday = None
|
||||
weekday_firstly_calendar = False
|
||||
|
||||
first_weekday = self.reader.find_any(WEEKDAYS_STARTSWITH, startswith=True, nospace=True)
|
||||
|
||||
if first_weekday is None:
|
||||
self.features.add("no_weekdays")
|
||||
print(" -- Failed parse! -- ")
|
||||
print("ПОНЕДЕЛЬНИК НЕ НАЙДЕН!")
|
||||
self.parser_error = "'ПОНЕДЕЛЬНИК' не найден в таблице."
|
||||
print("дни недели не найдены!")
|
||||
self.parser_error = f"{WEEKDAYS_STARTSWITH} ни один найден в таблице. Дня недели нет."
|
||||
return
|
||||
|
||||
if monday.col != 4:
|
||||
print("--- warning parse! ---")
|
||||
print(f"Monday col != 4 (actual: {monday})")
|
||||
self.parser_warnings.append(f"Monday col != 4 (actual: {monday}); Это, наверное, может работать не стабильно!")
|
||||
pair_num_any = self.reader.find_any(PAIR_NUMS, nospace=True)
|
||||
if pair_num_any is None:
|
||||
no_pair_numeration = True
|
||||
self.features.add("no_pair_numeration")
|
||||
self.parser_warnings.append(f"Нет нумерации академических часов {PAIR_NUMS}")
|
||||
|
||||
else:
|
||||
self.features.add("pair_numeration")
|
||||
col_distance_pair_weekday = pair_num_any.col - first_weekday.col
|
||||
|
||||
head_rx = first_weekday.row - 1 # выше первого понидельника
|
||||
group_col_start = first_weekday.col + 2
|
||||
if col_distance_pair_weekday is not None:
|
||||
if col_distance_pair_weekday > 1:
|
||||
weekday_firstly_calendar = True
|
||||
self.features.add("weekdays_before_calendar")
|
||||
group_col_start = pair_num_any.col + 1
|
||||
|
||||
head_rx = monday.row - 1 # выше первого понидельника
|
||||
if head_rx < 0:
|
||||
raise Exception("head_rx < 0: Программа пыталась найти 'ПОНЕДЕЛЬНИК', но по всей видимости не нашла.")
|
||||
raise Exception("head_rx < 0: Программа пыталась найти день недели, но по всей видимости не нашла.")
|
||||
|
||||
head = self.reader.get_row_values(head_rx) # get all ROW (months, groups)
|
||||
pprint(f"head={head}")
|
||||
self.groups = parse_groups(self.reader, head, monday, head_rx) # parse groups to self.groups
|
||||
|
||||
head_joined = " ||| ".join([v for v in head if isinstance(v, str) and v.strip()])
|
||||
print(head_joined)
|
||||
if "1 неделя" in head_joined or "1 НЕДЕЛЯ" in head_joined or "2 неделя" in head_joined or "2 НЕДЕЛЯ" in head_joined or "ИЗМЕНЕНИЯ" in head_joined or "изменения" in head_joined or "vtf-vstu.ru" in head_joined:
|
||||
head_rx -= 1
|
||||
head = self.reader.get_row_values(head_rx) # get all ROW (months, groups)
|
||||
pprint(f"head (upper)={head}")
|
||||
self.features.add("post_groups_info_row")
|
||||
|
||||
self.groups = parse_groups(self.reader, head, group_col_start, head_rx) # parse groups to self.groups
|
||||
pprint(f'self.groups={json.dumps(self.groups, indent=2, ensure_ascii=False)}')
|
||||
|
||||
pprint("\n\n\n")
|
||||
|
||||
for group in self.groups.values():
|
||||
pprint("\nSTART OF PROCESS GROUP\n")
|
||||
self.process_group(group, monday)
|
||||
self.process_group(group, first_weekday, pair_num_any.col if pair_num_any else None)
|
||||
pprint("\nEND OF PROCESS GROUP\n")
|
||||
|
||||
self.process_weekmetadatas(monday)
|
||||
# week metadatas parse
|
||||
S = 9999999
|
||||
group_min_col = S
|
||||
group_min_row = S
|
||||
|
||||
def process_weekmetadatas(self, first_monday: "Coord"):
|
||||
for x in self.groups.values():
|
||||
p = x['position']
|
||||
group_min_row = min(p[0], group_min_row)
|
||||
group_min_col = min(p[1], group_min_col)
|
||||
|
||||
if group_min_row != S and group_min_col != S:
|
||||
pprint("Process weekmetadatas!")
|
||||
self.process_weekmetadatas(Coord(row=group_min_row, col=group_min_col))
|
||||
|
||||
# parse no-schedule raws (согласовано, и т.д.)
|
||||
self.parse_raw_no_schedule()
|
||||
|
||||
|
||||
def parse_raw_no_schedule(self):
    """Collect all text outside ``self.schedule_range_row`` into ``self.raw_no_schedule``.

    Every row that is not inside the inclusive ``[first, last]`` schedule row
    range is read from the reader; its non-blank string cells are appended to
    ``self.raw_no_schedule`` as one list per row. Rows without any non-blank
    string are skipped. Does nothing when no schedule range was recorded.
    """
    if self.schedule_range_row is None:
        return

    first, last = self.schedule_range_row[0], self.schedule_range_row[1]
    for row in range(self.reader.get_row_count()):
        # Skip the schedule itself. Iterating with range() also guarantees we
        # never read past the last row when the schedule ends at the bottom
        # of the sheet.
        if first <= row <= last:
            continue

        texts = [
            v for v in self.reader.get_row_values(row)
            if isinstance(v, str) and v.strip()
        ]
        if texts:
            self.raw_no_schedule.append(texts)
|
||||
|
||||
def process_weekmetadatas(self, first_group: "Coord"):
|
||||
"""Обработать календарик"""
|
||||
for x in self.weeknums.keys():
|
||||
pprint(x)
|
||||
set_of_merged: set = self.weeknums[x]
|
||||
@@ -82,14 +170,16 @@ class Parser:
|
||||
self.parser_warnings.append(f"Processing weekmetadata for '{x}' failed because weekday excel block width != 1 (actual {weekday_merged.width()})")
|
||||
continue
|
||||
|
||||
month_row = first_monday.row - 1
|
||||
curr_col = weekday_merged.low.col - 1
|
||||
month_row = first_group.row
|
||||
curr_col = first_group.col - 1
|
||||
while curr_col >= 0:
|
||||
month_pos = Coord(month_row, curr_col)
|
||||
month_cell = month_pos.cell(self.reader)
|
||||
if month_cell.is_empty():
|
||||
pprint("month cell is empty")
|
||||
break
|
||||
curr_col -= 1
|
||||
continue
|
||||
|
||||
month_name = str(month_cell.value).strip()
|
||||
pprint(month_cell)
|
||||
all_nums_of_month = utils.parse_all_dirt(self.reader, month_pos.replace(row=weekday_merged.low.row), right=1, down=weekday_merged.height())
|
||||
@@ -118,6 +208,16 @@ class Parser:
|
||||
def push_weekday_meta(self, weekday: str, weeknum: int, week_key_name: str, merged: "Merged"):
    """Register the merged weekday cell *merged* under *week_key_name*.

    The cell is added to the set kept in ``self.weeknums`` for that key.
    ``weekday`` and ``weeknum`` are accepted but not used here.
    """
    bucket = self.weeknums[week_key_name]
    bucket.add(merged)
|
||||
|
||||
def row_with_schedule_notify(self, row_coord):
    """Widen ``self.schedule_range_row`` so that it covers *row_coord*.

    The range is a two-item list ``[lowest_row, highest_row]``. It is created
    on the first call and afterwards updated in place.
    """
    span = self.schedule_range_row
    if span is None:
        # First notification: the range is just this single row.
        self.schedule_range_row = [row_coord, row_coord]
        return

    if row_coord > span[1]:
        span[1] = row_coord

    if row_coord < span[0]:
        span[0] = row_coord
|
||||
|
||||
def parse_potokoviy(self, merged: Merged):
|
||||
speaker = None
|
||||
location = None
|
||||
@@ -132,163 +232,157 @@ class Parser:
|
||||
|
||||
return {"loc": str(location).strip(), "leader": str(speaker).strip(), "name": str(merged.cell(self.reader).value).strip()}
|
||||
|
||||
def process_group(self, group: dict, monday: Coord):
|
||||
def process_group(self, group: dict, first_weekday: Coord, pair_pos_col):
|
||||
"""
|
||||
Обработать группы, выполняется для каждой группы, после того как они распарены (parse_groups)
|
||||
group = {'name': 'ИВТ-260', 'position': [5, 6], 'position_human': 'G6:J6'}
|
||||
"""
|
||||
pprint(f"process_group group={group}")
|
||||
group_name = group['name']
|
||||
pprint(group_name)
|
||||
row = group['position'][0] + 1 # counter for while, +1 for shift down; также номер строки в таблице (вроде с нуля)
|
||||
pprint(F"Имя группы: {group_name}")
|
||||
row_c1 = group['position'][0] + 1 # counter for while, +1 for shift down; также номер строки в таблице (вроде с нуля)
|
||||
self.row_with_schedule_notify(group['position'][0])
|
||||
group_header_pos = Coord(group['position'][0], group['position'][1])
|
||||
width = group['width']
|
||||
weeknum = 1 # номер недели, щёлкнет +1 при каком-то условии.
|
||||
previous_pair = None
|
||||
while row < self.reader.get_row_count(): # maybe условие чтобы не уйти ниже чем есть строк
|
||||
pos = Coord(row, group['position'][1]) # текущая позиция, верхний левый угол (=low)
|
||||
pprint(f"while pos={pos}")
|
||||
pos_right = pos.shift(right=3)
|
||||
pair_pos = pos.replace(col=monday.col + 1)
|
||||
weekday_pos = pos.replace(col=monday.col)
|
||||
merged = self.reader.get_merged_coord(pos)
|
||||
merged_cell = merged.cell(self.reader)
|
||||
cv = merged_cell.value
|
||||
# В конце (12 пара:>) название группы, можно использовать как якорь
|
||||
if utils.unspace(cv) == group_name:
|
||||
pprint("Lesson == group name; ending group loop.")
|
||||
|
||||
weekcycles = 0
|
||||
while row_c1 < self.reader.get_row_count():
|
||||
pos_c1 = Coord(row_c1, group['position'][1]) # текущая позиция, верхний левый угол (=low)
|
||||
self.row_with_schedule_notify(pos_c1.row)
|
||||
|
||||
|
||||
if pos_c1.cell(self.reader).is_nospace_nocase_same(group_name):
|
||||
pprint("Ended with grpup name; stop moving down, break")
|
||||
break
|
||||
|
||||
weekday_pos = pos_c1.replace(col=first_weekday.col)
|
||||
weekday_cell = weekday_pos.cell(self.reader)
|
||||
weekday_mr = self.reader.get_merged_coord(weekday_pos)
|
||||
weekday = utils.unspace(weekday_mr.cell(self.reader).value)
|
||||
pair_mr = self.reader.get_merged_coord(pair_pos)
|
||||
pair = utils.unspace(pair_mr.cell(self.reader).value)
|
||||
weekday = weekday_cell.value
|
||||
|
||||
skip = 0
|
||||
if weekday == "":
|
||||
if weeknum == 1:
|
||||
weeknum += 1
|
||||
pprint("------")
|
||||
skip = 1
|
||||
row += 1
|
||||
else:
|
||||
break
|
||||
if not is_weeknum(weekday):
|
||||
row_c1 += 1
|
||||
pprint("Not weeknum!")
|
||||
if weekcycles > 0:
|
||||
if (weeknum != 2):
|
||||
pprint("Weeknum now 2")
|
||||
weekday = 0
|
||||
weeknum = 2
|
||||
continue
|
||||
|
||||
if not skip:
|
||||
next = 3 # на сколько пыгнуть для следующего шага?
|
||||
pprint(weekday)
|
||||
weekday_key_name = weekday + ("_1" if weeknum == 1 else "_2")
|
||||
self.push_weekday_meta(weekday, weeknum, weekday_key_name, weekday_mr)
|
||||
|
||||
weekday_key_name = weekday + ("_1" if weeknum == 1 else "_2")
|
||||
self.push_weekday_meta(weekday, weeknum, weekday_key_name, weekday_mr)
|
||||
# state
|
||||
event_no = 1
|
||||
is_widely = False
|
||||
override_col_range = None
|
||||
all_raw = set()
|
||||
pairs = set()
|
||||
times = []
|
||||
first_coord = None
|
||||
|
||||
is_empty_lesson = len(utils.parse_all_dirt(self.reader, pos, 4, 3)) == 0 # если в поле не найдено ничего..
|
||||
parsed_discipline_name = None
|
||||
parsed_location = None
|
||||
parsed_leader = None
|
||||
pairs = 1
|
||||
wtf_tomanypairs = False
|
||||
is_solid = pos_right in merged
|
||||
parsed_uncotigorized = []
|
||||
is_wide_maybe_potokoviy = merged.width() > 4 # потоковая ли лекция (занимает несколько групп.)
|
||||
row_c2 = row_c1
|
||||
while row_c2 <= weekday_mr.high.row:
|
||||
pos_c2 = Coord(row_c2, group['position'][1]) # текущая позиция (внутри группы, внутри дня недели), верхний левый угол (=low)
|
||||
cell_c2 = pos_c2.cell(self.reader)
|
||||
mr_c2 = self.reader.get_merged_coord(pos_c2)
|
||||
|
||||
if not is_empty_lesson:
|
||||
cur = pos.shift(down=2)
|
||||
while utils.has_no_bottom_border(self.reader, cur):
|
||||
next += 3
|
||||
pairs += 1
|
||||
pprint(f"next = {next} cur={cur}")
|
||||
if pairs >= 7:
|
||||
wtf_tomanypairs = True
|
||||
break
|
||||
cur = cur.shift(down=3)
|
||||
if first_coord is None:
|
||||
first_coord = pos_c2.row
|
||||
|
||||
if is_wide_maybe_potokoviy:
|
||||
ret = self.parse_potokoviy(merged)
|
||||
parsed_location = ret['loc']
|
||||
parsed_leader = ret['leader']
|
||||
parsed_discipline_name = ret['name']
|
||||
parsed_uncotigorized = list(utils.parse_all_dirt(self.reader, merged.low, merged.width(), next))
|
||||
pair_num = None
|
||||
pair_num_mr = None
|
||||
if pair_pos_col is not None:
|
||||
pair_num = pos_c2.replace(col=pair_pos_col)
|
||||
pair_num_mr = self.reader.get_merged_coord(pair_num)
|
||||
|
||||
else:
|
||||
if (is_solid):
|
||||
parsed_discipline_name = cv
|
||||
if (not is_widely) and (mr_c2.low.col < group_header_pos.col or mr_c2.high.col > group_header_pos.col + width - 1):
|
||||
is_widely = True
|
||||
override_col_range = (mr_c2.low.col, mr_c2.high.col)
|
||||
|
||||
parsed_uncotigorized = list(utils.parse_all_dirt(self.reader, merged.low, 4, next))
|
||||
col_low = group_header_pos.col
|
||||
col_high = group_header_pos.col + width - 1
|
||||
if override_col_range is not None:
|
||||
col_low = min(col_low, override_col_range[0])
|
||||
col_high = max(col_high, override_col_range[1])
|
||||
|
||||
# попытка исправить пару (1-2) если пустая.
|
||||
fuck_empty_pair_in_excel = pair == ""
|
||||
previous_dump = previous_pair
|
||||
if fuck_empty_pair_in_excel:
|
||||
if previous_pair is None or previous_pair == "":
|
||||
pair = f"EMPTY_IN_EXCEL"
|
||||
else:
|
||||
pair = utils.next_element(PAIR_NUMS, previous_pair)
|
||||
dirty_line = utils.parse_all_dirt(self.reader, Coord(row_c2, col_low), (col_high - col_low + 1), 1, with_cells=True)
|
||||
if len(dirty_line) > 0:
|
||||
if pair_num_mr is not None:
|
||||
pair_num_to_add = pair_num_mr.cell(self.reader).value.replace(" ", "").strip()
|
||||
if len(pair_num_to_add) == 0:
|
||||
pair_num_to_add = "???"
|
||||
pairs.add(pair_num_to_add)
|
||||
|
||||
if pair != "":
|
||||
previous_pair = pair if next == 3 else None # костыль чтобы избежать гипотетически не верной даты.
|
||||
for cell in dirty_line:
|
||||
if not cell.is_time:
|
||||
all_raw.add(str(cell.value))
|
||||
else:
|
||||
dt: time = cell.value
|
||||
times.append(str(dt))
|
||||
|
||||
# пытаемся из некотегорезированных данных выцепить место и лидера (препода)
|
||||
prepods = set()
|
||||
if parsed_leader is not None: prepods.add(parsed_leader.strip())
|
||||
def clean_state():
|
||||
nonlocal is_widely, override_col_range, event_no, all_raw, pairs, times, first_coord
|
||||
is_widely = False
|
||||
override_col_range = None
|
||||
event_no += 1
|
||||
all_raw = set()
|
||||
pairs = set()
|
||||
first_coord = None
|
||||
times = []
|
||||
|
||||
locations = set()
|
||||
if parsed_location is not None: locations.add(parsed_location.strip().replace(" ", ""))
|
||||
|
||||
for x in list(parsed_uncotigorized):
|
||||
if aigenerated.is_surname_string(x):
|
||||
prepods.add(x.strip())
|
||||
if not utils.has_no_bottom_border(self.reader, pos_c2) and not(mr_c2.high.row - row_c2 > 0):
|
||||
if not (len(all_raw) == 0):
|
||||
# this code last for current state event
|
||||
pprint(f"№{event_no} {pairs}: {'[wide] ' if is_widely else ''} raw={all_raw}")
|
||||
|
||||
if aigenerated.is_room_number(x):
|
||||
locations.add(x.strip().replace(" ", "") if x is not None else None)
|
||||
slots = group['slots']
|
||||
w = weekday_key_name
|
||||
if w not in slots.keys():
|
||||
slots[w] = {}
|
||||
|
||||
# попытка починить пустую дисциплину
|
||||
if parsed_discipline_name is None:
|
||||
l = sorted(utils.remove_from_list(list(parsed_uncotigorized), list(locations | prepods | set([parsed_location, parsed_leader]))))
|
||||
parsed_discipline_name = " ".join(l)
|
||||
pair_name = "????"
|
||||
try:
|
||||
pair_name = sorted(pairs)[0]
|
||||
except: pass
|
||||
|
||||
# чистим сеты от мусора
|
||||
utils.discards_list(prepods, nones=True, emptystrings=True)
|
||||
utils.discards_list(locations, nones=True, emptystrings=True)
|
||||
utils.discards_list(parsed_uncotigorized, nones=True, emptystrings=True)
|
||||
|
||||
# если не пустой предмет то записываем его
|
||||
if not is_empty_lesson:
|
||||
slots = group['slots']
|
||||
w = weekday_key_name
|
||||
if w not in slots.keys():
|
||||
slots[w] = {}
|
||||
|
||||
today = slots[w]
|
||||
today[pair] = {
|
||||
"excel_pos": str(pos),
|
||||
"discipline_name": parsed_discipline_name.strip(),
|
||||
"locations": sorted(locations),
|
||||
"leads": sorted(prepods),
|
||||
"is_solid": is_solid,
|
||||
"time_coeff": pairs,
|
||||
"is_flow": is_wide_maybe_potokoviy,
|
||||
"lefttopmerged": {
|
||||
"width": merged.width(),
|
||||
"height": merged.height(),
|
||||
"excel_range": utils.merged_humanize(merged.as_numbers())
|
||||
},
|
||||
"raw": sorted(parsed_uncotigorized),
|
||||
"weekday": utils.weekday_to_num(weekday),
|
||||
"weeknum": weeknum
|
||||
}
|
||||
if fuck_empty_pair_in_excel:
|
||||
today[pair]['pair_num_empty'] = {
|
||||
"prev": previous_dump,
|
||||
"restored": pair != "",
|
||||
"pair": pair
|
||||
today = slots[w]
|
||||
obj = {
|
||||
"object": "event",
|
||||
"pairs": sorted(pairs),
|
||||
"is_flow": is_widely,
|
||||
"excel_range": utils.merged_humanize((first_coord, col_low, row_c2, col_high)),
|
||||
"raw": sorted(all_raw),
|
||||
"weekday": utils.weekday_to_num(weekday),
|
||||
"weeknum": weeknum
|
||||
}
|
||||
if wtf_tomanypairs:
|
||||
today[pair]['to_many_parsing_time_coeff'] = True
|
||||
if len(times) > 0:
|
||||
obj['times'] = times
|
||||
|
||||
if pair_pos_col is None:
|
||||
slots[w] = obj
|
||||
else:
|
||||
today[pair_name] = obj
|
||||
# here may be a empty all_raw
|
||||
clean_state()
|
||||
first_coord = None
|
||||
|
||||
|
||||
# INCREMENT на next и конец цикла.
|
||||
row += next
|
||||
if row_c2 >= weekday_mr.high.row:
|
||||
clean_state()
|
||||
pprint("Last for weekday")
|
||||
|
||||
row_c2 += 1
|
||||
|
||||
row_c1 += weekday_mr.height()
|
||||
weekcycles += 1
|
||||
|
||||
def parse_groups(reader: "ExcelSheetReader", head, monday: Coord, head_rx):
|
||||
def parse_groups(reader: "ExcelSheetReader", head, col_start, head_rx):
|
||||
"""Распознать список групп и метаданные к ним, по сути получить список названий группы и координат её верхнего header-а (AQ6:AT6)"""
|
||||
groups = {}
|
||||
i = 0
|
||||
@@ -296,21 +390,26 @@ def parse_groups(reader: "ExcelSheetReader", head, monday: Coord, head_rx):
|
||||
x = head[i]
|
||||
pprint(f"while i={i} head[i]={x}")
|
||||
merged = reader.get_merged_coord(Coord(head_rx, i))
|
||||
if i > monday.col + 1:
|
||||
if merged is None or x == "":
|
||||
break
|
||||
|
||||
if merged.width() != 4:
|
||||
pprint(f"WARNING: group header witdh !=4 (found: {merged.width()}); blocks !=4 not supported by parser.")
|
||||
if i >= col_start:
|
||||
if merged is None or x == "" or x is None:
|
||||
break
|
||||
|
||||
name = utils.unspace(x)
|
||||
groups[name] = {
|
||||
"name": name,
|
||||
"position": [head_rx, i],
|
||||
"position_human": utils.merged_humanize(merged.as_numbers()),
|
||||
"slots": {}
|
||||
}
|
||||
skip = False
|
||||
if "-" not in name:
|
||||
for x in bad_group_names:
|
||||
if x in name.lower():
|
||||
skip = True
|
||||
pprint(f"Skip groupname {name} because not dash in name and in blacklist")
|
||||
|
||||
if not skip:
|
||||
groups[name] = {
|
||||
"name": name,
|
||||
"position": [head_rx, i],
|
||||
"width": merged.width(),
|
||||
"position_human": utils.merged_humanize(merged.as_numbers()),
|
||||
"slots": {}
|
||||
}
|
||||
|
||||
if merged is None:
|
||||
i += 1
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# --- Абстрактный базовый класс (Контракт) ---
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from datetime import datetime, time
|
||||
|
||||
import openpyxl
|
||||
import xlrd
|
||||
@@ -10,10 +11,19 @@ from coord import Coord, Merged
|
||||
EMPTY_CTYPES = [xlrd.XL_CELL_EMPTY, xlrd.XL_CELL_BLANK]
|
||||
|
||||
class TranschendentnostCell:
|
||||
def __init__(self, value, is_empty):
|
||||
def __init__(self, value, is_empty, is_time=False):
|
||||
self.value = value
|
||||
self.is_time = isinstance(value, time) or is_time
|
||||
self._is_empty = is_empty
|
||||
|
||||
def is_nospace_nocase_same(self, query):
    """Compare the cell value with *query*, ignoring case and spaces.

    Both sides are lower-cased, have their spaces removed and are stripped
    before comparison.

    Returns:
        True when both are strings and equal after normalisation; False
        otherwise, including when the cell value or *query* is not a string.
    """
    value = self.value
    # An explicit type check replaces the former bare ``except: pass``, which
    # also swallowed KeyboardInterrupt/SystemExit and hid unrelated errors.
    if not isinstance(value, str) or not isinstance(query, str):
        return False

    def _fold(text):
        return text.lower().replace(" ", "").strip()

    return _fold(value) == _fold(query)
|
||||
|
||||
def is_empty(self):
|
||||
return self._is_empty
|
||||
|
||||
@@ -29,6 +39,10 @@ class ExcelSheetReader(ABC):
|
||||
def get_sheet_index(self):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_sheet_name(self):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def has_next_sheet(self):
|
||||
pass
|
||||
@@ -71,16 +85,28 @@ class ExcelSheetReader(ABC):
|
||||
return "TODO: info"
|
||||
|
||||
@abstractmethod
|
||||
def cell(self, row, col):
|
||||
def cell(self, row, col) -> TranschendentnostCell:
|
||||
"""Возвращает абстрактную клетку"""
|
||||
pass
|
||||
|
||||
def find(self, query = None):
|
||||
def find(self, query = None, startswith=False, nospace=False):
|
||||
return self.find_any([query], startswith=startswith, nospace=nospace)
|
||||
|
||||
def find_any(self, query = None, startswith=False, nospace=False):
|
||||
for rx in range(self.get_row_count()):
|
||||
i = 0
|
||||
for x in self.get_row_values(rx):
|
||||
if x == query:
|
||||
return Coord(rx, i)
|
||||
if nospace:
|
||||
x = str(x).replace(" ", "").strip()
|
||||
|
||||
for query_selected in query:
|
||||
if x == query_selected:
|
||||
return Coord(rx, i)
|
||||
elif startswith:
|
||||
try:
|
||||
if str(x).lower().startswith(query_selected.lower()):
|
||||
return Coord(rx, i)
|
||||
except: pass
|
||||
i += 1
|
||||
|
||||
return None
|
||||
@@ -118,6 +144,9 @@ class XlrdSheetReader(ExcelSheetReader):
|
||||
def init_sheet(self):
|
||||
self.sheet = self.book.sheet_by_index(self.sheet_index)
|
||||
|
||||
def get_sheet_name(self):
|
||||
return self.sheet.name
|
||||
|
||||
def has_next_sheet(self):
|
||||
return self.sheet_index < len(self.book.sheet_names())-1
|
||||
|
||||
@@ -140,7 +169,24 @@ class XlrdSheetReader(ExcelSheetReader):
|
||||
def cell(self, row, col):
|
||||
"""Возвращает абстрактную клетку"""
|
||||
c = self.sheet.cell(row, col)
|
||||
return TranschendentnostCell(c.value, c.ctype in EMPTY_CTYPES)
|
||||
is_empty = c.ctype in EMPTY_CTYPES
|
||||
is_time = c.ctype == xlrd.XL_CELL_DATE
|
||||
value = c.value
|
||||
if is_empty:
|
||||
value = ""
|
||||
elif is_time:
|
||||
if isinstance(value, float):
|
||||
if value <= 1:
|
||||
seconds = round(value * 86400)
|
||||
minutes, seconds = divmod(seconds, 60)
|
||||
hours, minutes = divmod(minutes, 60)
|
||||
value = time(hour=hours, second=seconds, minute=minutes)
|
||||
else:
|
||||
print(f"TODO: value is {value} its unix? not 0.xxxxxxxx")
|
||||
else:
|
||||
is_time = False
|
||||
print("IsTime but not float!")
|
||||
return TranschendentnostCell(value, is_empty, is_time=is_time)
|
||||
|
||||
def get_border_style(self, coord: Coord, side):
|
||||
row = coord.row
|
||||
@@ -192,6 +238,9 @@ class OpenpyxlSheetReader(ExcelSheetReader):
|
||||
def get_sheet_index(self):
|
||||
return self.sheet_index
|
||||
|
||||
def get_sheet_name(self):
|
||||
return self.workbook.sheetnames[self.sheet_index]
|
||||
|
||||
def has_next_sheet(self):
|
||||
return self.sheet_index < len(self.workbook.sheetnames)-1
|
||||
|
||||
@@ -221,7 +270,7 @@ class OpenpyxlSheetReader(ExcelSheetReader):
|
||||
c = self._get_cell(row, col)
|
||||
|
||||
is_empty = (c.value is None)
|
||||
return TranschendentnostCell("" if is_empty else c.value, is_empty)
|
||||
return TranschendentnostCell("" if is_empty else c.value, is_empty, is_time=isinstance(c.value, time))
|
||||
|
||||
def get_cell_value(self, row, col):
|
||||
cell = self._get_cell(row, col)
|
||||
|
||||
13
utils.py
13
utils.py
@@ -53,7 +53,7 @@ def remove_from_list(l: list, todel: list):
|
||||
|
||||
return l
|
||||
|
||||
def parse_all_dirt(reader: "ExcelSheetReader", min_pos: Coord, right, down):
|
||||
def parse_all_dirt(reader: "ExcelSheetReader", min_pos: Coord, right, down, with_cells=False):
|
||||
RET = set()
|
||||
|
||||
row = min_pos.row
|
||||
@@ -61,10 +61,9 @@ def parse_all_dirt(reader: "ExcelSheetReader", min_pos: Coord, right, down):
|
||||
col = min_pos.col
|
||||
while col < min_pos.col + right:
|
||||
#print(excel_coordinate(row, col))
|
||||
cv = reader.get_cell_value(row, col)
|
||||
value = str(cv).strip()
|
||||
if cv is not None and len(value) > 0:
|
||||
RET.add(value)
|
||||
cv = reader.cell(row, col)
|
||||
if cv is not None and not cv.is_empty():
|
||||
RET.add(cv if with_cells else str(cv.value))
|
||||
col += 1
|
||||
row += 1
|
||||
|
||||
@@ -165,7 +164,7 @@ def find(sh, query = None):
|
||||
return None
|
||||
|
||||
def weekday_to_num(st: str):
|
||||
if st.upper().strip() == "ПОНЕДЕЛЬНИК":
|
||||
if st.upper().strip().startswith("ПОНЕД"):
|
||||
return 1
|
||||
if st.upper().strip() == "ВТОРНИК":
|
||||
return 2
|
||||
@@ -177,7 +176,7 @@ def weekday_to_num(st: str):
|
||||
return 5
|
||||
if st.upper().strip() == "СУББОТА":
|
||||
return 6
|
||||
if st.upper().strip() == "ВОСКРЕСЕНЬЕ":
|
||||
if st.upper().strip().startswith("ВОСКР"):
|
||||
return 7
|
||||
|
||||
return -1
|
||||
|
||||
Reference in New Issue
Block a user