Files
VSTU_Schedule_Parser/parser.py

322 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Copyright Stanislav Mironov
# Labels of the double-period slots in day order: "1-2", "3-4", ..., "15-16".
# Each label pairs an odd period number with the even one that follows it.
PAIR_NUMS = [f"{first}-{first + 1}" for first in range(1, 16, 2)]
import json
import uuid
import aigenerated
from coord import Coord, Merged
from translations import ExcelSheetReader
import utils
from collections import defaultdict
# Module-wide switch for the debug trace; pprint() stays silent while it is falsy.
LOGGING = False


def pprint(*args, **kwargs):
    """Forward the call to the built-in ``print`` only while ``LOGGING`` is truthy.

    Accepts exactly the positional and keyword arguments ``print`` accepts and
    always returns ``None``.
    """
    if not LOGGING:
        return
    print(*args, **kwargs)
class Parser:
    """Walk one schedule sheet and collect the lessons of every group on it.

    After :meth:`parse` the results are available as attributes:

    * ``groups`` - the mapping returned by ``parse_groups``; the ``"slots"``
      dict of each group is filled in by :meth:`process_group`.
    * ``week_keys_metadata`` - per ``"<WEEKDAY>_<1|2>"`` key either
      ``{month_name: [values...]}`` or an ``{"error": True, "error_text": ...}``
      record, built by :meth:`process_weekmetadatas`.
    * ``parser_error`` - message set when the sheet cannot be parsed at all,
      otherwise ``None``.
    * ``parser_warnings`` - list of non-fatal problem descriptions.
    """

    def __init__(self, reader: "ExcelSheetReader"):
        """Remember *reader* and reset all result containers."""
        self.reader = reader
        self.groups = {}
        self.week_keys_metadata = {}
        # week key -> set of merged weekday cells; sets are not JSON-serialisable.
        self.weeknums: defaultdict = defaultdict(set)
        self.parser_error = None
        self.parser_warnings = []
        pprint("Parser created for '{0}'".format(reader.info()))

    @staticmethod
    def _cell_text(value):
        """Return *value* as stripped text; a missing (``None``) value becomes ``""``."""
        return "" if value is None else str(value).strip()

    def parse(self):
        """Parse the whole sheet into ``groups`` and ``week_keys_metadata``.

        The cell containing "ПОНЕДЕЛЬНИК" is the anchor: the row directly above
        it is read as the header row that is handed to ``parse_groups``.

        Returns ``None``. When the anchor is missing, ``parser_error`` is set
        and nothing else is parsed.

        Raises:
            Exception: if the anchor is in the first row, so no header row
                exists above it.
        """
        monday = self.reader.find("ПОНЕДЕЛЬНИК")
        if monday is None:
            print(" -- Failed parse! -- ")
            print("ПОНЕДЕЛЬНИК НЕ НАЙДЕН!")
            self.parser_error = "'ПОНЕДЕЛЬНИК' не найден в таблице."
            return
        if monday.col != 4:
            # Unexpected layout: keep going, but record that results may be unreliable.
            print("--- warning parse! ---")
            print(f"Monday col != 4 (actual: {monday})")
            self.parser_warnings.append(f"Monday col != 4 (actual: {monday}); Это, наверное, может работать не стабильно!")
        head_rx = monday.row - 1  # row directly above the first Monday cell
        if head_rx < 0:
            raise Exception("head_rx < 0: Программа пыталась найти 'ПОНЕДЕЛЬНИК', но по всей видимости не нашла.")
        head = self.reader.get_row_values(head_rx)  # the whole header row (months, groups)
        pprint(f"head={head}")
        self.groups = parse_groups(self.reader, head, monday, head_rx)
        pprint(f'self.groups={json.dumps(self.groups, indent=2, ensure_ascii=False)}')
        pprint("\n\n\n")
        for group in self.groups.values():
            pprint("\nSTART OF PROCESS GROUP\n")
            self.process_group(group, monday)
            pprint("\nEND OF PROCESS GROUP\n")
        self.process_weekmetadatas(monday)

    def process_weekmetadatas(self, first_monday: "Coord"):
        """Fill ``week_keys_metadata`` from the weekday cells gathered in ``weeknums``.

        For every week key exactly one merged weekday cell of width 1 is
        required; otherwise an error record is stored for that key and a
        warning is appended. For a valid key the columns to the left of the
        weekday cell are walked right-to-left until an empty cell is met in the
        row above *first_monday*; that cell's text is used as the month name and
        the values found beside the weekday block are collected under it.

        ``weeknums`` is only read, never modified, so the method can be called
        repeatedly with the same outcome.
        """
        for week_key, merged_cells in self.weeknums.items():
            pprint(week_key)
            count = len(merged_cells)
            if count != 1:
                self.week_keys_metadata[week_key] = {
                    "error": True,
                    "error_text": f"Parse error: count of found '{week_key}' (need view like WEEKDAY_1; weekday - in r; 1 - weeknum[1, 2]) is {count}; required only one!"
                }
                self.parser_warnings.append(f"Processing weekmetadata for '{week_key}' failed because count of uniqie merged cells not one (actual: {count}). :<")
                continue
            # Peek at the single element instead of pop(): popping would empty the
            # stored set and corrupt any later pass over self.weeknums.
            weekday_merged = next(iter(merged_cells))
            width = weekday_merged.width()
            if width != 1:
                self.week_keys_metadata[week_key] = {
                    "error": True,
                    "error_text": f"Weekday excel block width != 1 (actual {width})"
                }
                self.parser_warnings.append(f"Processing weekmetadata for '{week_key}' failed because weekday excel block width != 1 (actual {width})")
                continue
            month_row = first_monday.row - 1
            curr_col = weekday_merged.low.col - 1
            while curr_col >= 0:
                month_pos = Coord(month_row, curr_col)
                month_cell = month_pos.cell(self.reader)
                if month_cell.is_empty():
                    pprint("month cell is empty")
                    break
                month_name = str(month_cell.value).strip()
                pprint(month_cell)
                all_nums_of_month = utils.parse_all_dirt(self.reader, month_pos.replace(row=weekday_merged.low.row), right=1, down=weekday_merged.height())
                pprint(f"all_nums_of_month={all_nums_of_month}")
                collected = self.week_keys_metadata.setdefault(week_key, {}).setdefault(month_name, [])
                for raw in all_nums_of_month:
                    if raw.lower() == month_name.lower():
                        pprint(f"Skip {raw} month number because it == month_name")
                        continue
                    # Normalise first, then de-duplicate on the normalised text, so
                    # that e.g. "1.0" and "1" cannot both end up in the list.
                    # NOTE(review): replace() drops ".0" anywhere in the text, not
                    # only a trailing float suffix - confirm that is intended.
                    value = str(raw).replace(".0", "")
                    if value not in collected:
                        collected.append(value)
                curr_col -= 1

    def push_weekday_meta(self, weekday: str, weeknum: int, week_key_name: str, merged: "Merged"):
        """Record *merged* as a weekday cell seen for *week_key_name*.

        *weekday* and *weeknum* are accepted but not used here.
        """
        self.weeknums[week_key_name].add(merged)

    def parse_potokoviy(self, merged: "Merged"):
        """Read title, lecturer and location of a wide ("flow") lesson block.

        The lecturer is read ``merged.height()`` rows below the block's ``low``
        corner, the location one row below its ``high`` corner, and the title
        from the merged cell itself.

        Returns:
            dict with string values under ``"loc"``, ``"leader"`` and
            ``"name"``. An empty (``None``) cell yields ``""`` rather than the
            text ``"None"``.
        """
        speaker = merged.low.shift(down=merged.height()).cell(self.reader).value
        location = merged.high.shift(down=1).cell(self.reader).value
        title = merged.cell(self.reader).value
        return {
            "loc": self._cell_text(location),
            "leader": self._cell_text(speaker),
            "name": self._cell_text(title),
        }

    def process_group(self, group: dict, monday: "Coord"):
        """Collect every lesson of one group into ``group['slots']``.

        Called once per group after ``parse_groups`` has produced them, e.g.
        ``group = {'name': 'ИВТ-260', 'position': [5, 6], 'position_human': 'G6:J6'}``.

        Scanning starts one row below the group header and stops at the last
        sheet row, at a cell whose text equals the group name, or at the second
        row with an empty weekday column. The first such row switches the week
        number from 1 to 2.

        Each non-empty lesson is stored as
        ``group['slots']["<WEEKDAY>_<1|2>"][<pair label>]``.
        """
        pprint(f"process_group group={group}")
        group_name = group['name']
        pprint(group_name)
        row = group['position'][0] + 1  # first row below the group header
        weeknum = 1  # switches to 2 at the first row with an empty weekday cell
        previous_pair = None
        while row < self.reader.get_row_count():  # never walk past the last sheet row
            pos = Coord(row, group['position'][1])  # current position, top-left corner (= low)
            pprint(f"while pos={pos}")
            pos_right = pos.shift(right=3)
            pair_pos = pos.replace(col=monday.col + 1)
            weekday_pos = pos.replace(col=monday.col)
            merged = self.reader.get_merged_coord(pos)
            merged_cell = merged.cell(self.reader)
            cv = merged_cell.value
            # The group name is repeated at the bottom of its column; use it as the end anchor.
            if utils.unspace(cv) == group_name:
                pprint("Lesson == group name; ending group loop.")
                break
            weekday_mr = self.reader.get_merged_coord(weekday_pos)
            weekday = utils.unspace(weekday_mr.cell(self.reader).value)
            pair_mr = self.reader.get_merged_coord(pair_pos)
            pair = utils.unspace(pair_mr.cell(self.reader).value)
            if weekday == "":
                if weeknum != 1:
                    break
                # First row without a weekday: move on to the second week.
                weeknum += 1
                pprint("------")
                row += 1
                continue

            step = 3  # rows to advance to the next lesson; grows by 3 per extra period spanned
            weekday_key_name = weekday + ("_1" if weeknum == 1 else "_2")
            self.push_weekday_meta(weekday, weeknum, weekday_key_name, weekday_mr)
            # The lesson counts as empty when nothing at all is found in its block.
            is_empty_lesson = len(utils.parse_all_dirt(self.reader, pos, 4, 3)) == 0
            parsed_discipline_name = None
            parsed_location = None
            parsed_leader = None
            pairs = 1
            wtf_tomanypairs = False
            # True when the cell 3 columns to the right belongs to the same merged block.
            is_solid = pos_right in merged
            parsed_uncotigorized = []
            # Wider than 4 columns: possibly a flow lecture shared by several groups.
            is_wide_maybe_potokoviy = merged.width() > 4
            if not is_empty_lesson:
                # While the probed cell has no bottom border the lesson continues
                # into the next 3-row period; give up after 7 periods.
                cur = pos.shift(down=2)
                while utils.has_no_bottom_border(self.reader, cur):
                    step += 3
                    pairs += 1
                    pprint(f"next = {step} cur={cur}")
                    if pairs >= 7:
                        wtf_tomanypairs = True
                        break
                    cur = cur.shift(down=3)
                if is_wide_maybe_potokoviy:
                    ret = self.parse_potokoviy(merged)
                    parsed_location = ret['loc']
                    parsed_leader = ret['leader']
                    parsed_discipline_name = ret['name']
                    parsed_uncotigorized = list(utils.parse_all_dirt(self.reader, merged.low, merged.width(), step))
                else:
                    if is_solid:
                        parsed_discipline_name = cv
                    parsed_uncotigorized = list(utils.parse_all_dirt(self.reader, merged.low, 4, step))

            # Try to restore the pair label (e.g. "1-2") when its cell is empty.
            fuck_empty_pair_in_excel = pair == ""
            previous_dump = previous_pair
            if fuck_empty_pair_in_excel:
                if previous_pair is None or previous_pair == "":
                    pair = "EMPTY_IN_EXCEL"
                else:
                    pair = utils.next_element(PAIR_NUMS, previous_pair)
            if pair != "":
                # Workaround: after a multi-period lesson the follow-up label cannot
                # be derived reliably, so forget the previous one.
                previous_pair = pair if step == 3 else None

            # Pick lecturers and rooms out of the uncategorised texts.
            prepods = set()
            if parsed_leader is not None:
                prepods.add(parsed_leader.strip())
            locations = set()
            if parsed_location is not None:
                locations.add(parsed_location.strip().replace(" ", ""))
            for item in parsed_uncotigorized:
                if aigenerated.is_surname_string(item):
                    prepods.add(item.strip())
                if aigenerated.is_room_number(item):
                    locations.add(item.strip().replace(" ", "") if item is not None else None)

            # No discipline name yet: build one from whatever is neither a room nor a lecturer.
            if parsed_discipline_name is None:
                leftovers = sorted(utils.remove_from_list(list(parsed_uncotigorized), list(locations | prepods | {parsed_location, parsed_leader})))
                parsed_discipline_name = " ".join(leftovers)

            # Drop None and empty-string entries from the collections.
            utils.discards_list(prepods, nones=True, emptystrings=True)
            utils.discards_list(locations, nones=True, emptystrings=True)
            utils.discards_list(parsed_uncotigorized, nones=True, emptystrings=True)

            # Store the lesson only when its block was not empty.
            if not is_empty_lesson:
                today = group['slots'].setdefault(weekday_key_name, {})
                today[pair] = {
                    "excel_pos": str(pos),
                    "discipline_name": parsed_discipline_name.strip(),
                    "locations": sorted(locations),
                    "leads": sorted(prepods),
                    "is_solid": is_solid,
                    "time_coeff": pairs,
                    "is_flow": is_wide_maybe_potokoviy,
                    "lefttopmerged": {
                        "width": merged.width(),
                        "height": merged.height(),
                        "excel_range": utils.merged_humanize(merged.as_numbers())
                    },
                    "raw": sorted(parsed_uncotigorized),
                    "weekday": utils.weekday_to_num(weekday),
                    "weeknum": weeknum
                }
                if fuck_empty_pair_in_excel:
                    today[pair]['pair_num_empty'] = {
                        "prev": previous_dump,
                        "restored": pair != "",
                        "pair": pair
                    }
                if wtf_tomanypairs:
                    today[pair]['to_many_parsing_time_coeff'] = True

            # Advance past the rows this lesson occupied.
            row += step
def parse_groups(reader: "ExcelSheetReader", head, monday: "Coord", head_rx):
    """Recognise the groups listed in the header row and where their columns start.

    The header row *head* is walked left to right, jumping over each merged
    cell as a whole. Only cells right of column ``monday.col + 1`` are treated
    as group headers. The walk stops at the first blank header or at the first
    header whose merged width is not 4 (other widths are not supported).

    Args:
        reader: sheet reader used to look up merged-cell ranges.
        head: values of the header row, indexed by column.
        monday: position of the first weekday cell; groups start to its right.
        head_rx: index of the header row.

    Returns:
        dict mapping the group name to ``{"name", "position": [row, col],
        "position_human", "slots": {}}``; ``position_human`` is the header's
        merged range in spreadsheet notation (e.g. ``AQ6:AT6``).
    """
    groups = {}
    i = 0
    while i < len(head):
        x = head[i]
        pprint(f"while i={i} head[i]={x}")
        merged = reader.get_merged_coord(Coord(head_rx, i))
        if i > monday.col + 1:
            # A missing value ends the group list just like an empty string does.
            if merged is None or x is None or x == "":
                break
            if merged.width() != 4:
                pprint(f"WARNING: group header witdh !=4 (found: {merged.width()}); blocks !=4 not supported by parser.")
                break
            name = utils.unspace(x)
            if name == "":
                # Whitespace-only header: do not register a group with an empty name.
                break
            groups[name] = {
                "name": name,
                "position": [head_rx, i],
                "position_human": utils.merged_humanize(merged.as_numbers()),
                "slots": {}
            }
        # Step over the whole merged header, or a single column when there is none.
        i += 1 if merged is None else merged.width()
    return groups