diff --git a/.gitignore b/.gitignore index 99010fe..2dec1e4 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ __pycache__ .idea result*.json groups.json +diffable_dates.txt \ No newline at end of file diff --git a/links_parser.py b/links_parser.py index 3553032..584a02f 100644 --- a/links_parser.py +++ b/links_parser.py @@ -10,6 +10,11 @@ from bs4 import BeautifulSoup BASE_URL = "https://www.vstu.ru/" RASP_PREFIX = "https://www.vstu.ru/student/raspisaniya/zanyatiy/index.php?dep=" +def sibling_clear_to_date(s: str): + if s is None: + return "!!!Python None!!!" + return s.lower().replace("(последнее изменение:", "").replace(")", "").strip() + # Парсит ссылки на эксель .xls & .xlsx файлы и выдаёт их def parse_links(facultets): session = requests.Session() @@ -27,7 +32,7 @@ def parse_links(facultets): } ) - EXCEL_LINKS = {} + EXCEL_LINKS = [] for facultet in facultets: url = RASP_PREFIX + facultet print("getting...") @@ -38,19 +43,23 @@ def parse_links(facultets): # Ищем все теги , у которых атрибут href соответствует нашему паттерну excel_tags = soup.find_all('a', href=excel_pattern) - excel_links = [tag.get('href') for tag in excel_tags] - - # Предположим, вы уже получили excel_links из одного из методов выше - # excel_links = ['../../../upload/raspisanie/z/ОН_ХТФ_1 курс.xlsx', ...] - - absolute_links = [urljoin(BASE_URL, relative_link) for relative_link in excel_links] - - if facultet not in EXCEL_LINKS.keys(): - EXCEL_LINKS[facultet] = set() - - for excel_url in absolute_links: - EXCEL_LINKS[facultet].add(excel_url) - print(f"+url {excel_url}") + for a in excel_tags: + last_changed = sibling_clear_to_date(a.next_sibling) + url = urljoin(BASE_URL, a.get('href')) + record = { + "facultet": facultet, + "url": url, + "last_changed": last_changed + } + print(record) + EXCEL_LINKS.append(record) - return EXCEL_LINKS + return sorted(EXCEL_LINKS, key=lambda x: x['url']) + +def excels_to_diffabledates(excels): + dates = [] + for excel in excels: + dates.append(f"{excel['last_changed']} {excel['facultet']} {excel['url']}") + + return "\n".join(sorted(dates)).strip() diff --git a/main.py b/main.py index f78a644..4f6fa2c 100644 --- a/main.py +++ b/main.py @@ -19,10 +19,11 @@ import shutil def currt(): return round(time.time()) -FACULTETS = [ +FACULTETS = sorted([ "asp", "mag", "fastiv", "fat", "ftkm", "ftpp", "feu", "fevt", "htf", "vkf", "mmf", "fpik" -] +]) DIRNAME = "excels" +DIFFABLE_DATES = "diffable_dates.txt" DEBUG_ONE_FAC = None #'htf' result_groups = {} @@ -63,20 +64,15 @@ result = { "see_header_at_top_of_this_file": "SEE TOP OF THIS FILE | ОБРАТИТЕ ВНИМАНИЕ НА ВЕРХ ЭТОГО ФАЙЛА" } -def process_excel_file(facultet, excel_url, counter, timeid): +def process_excel_file(facultet, excel_url, counter, latest_changed): is_xlsx = excel_url.endswith(".xlsx") - filename = f"{DIRNAME}/" + timeid + f"_[C{counter}]_" + facultet + ".xls" + ("x" if is_xlsx else "") + filename = f"{DIRNAME}/" + f"_[C{counter}]_" + facultet + ".xls" + ("x" if is_xlsx else "") excel_info = { "filename": excel_url.split("/")[-1], "url": excel_url, + "latest_changed": latest_changed, "download_place": filename, - "stat": { - "download": -1, - "create_reader": -1, - "parse": -1, - "cycles": 0 - }, "group_names_parsed": [], "facultet": facultet, "counter": counter @@ -84,17 +80,12 @@ def process_excel_file(facultet, excel_url, counter, timeid): parser.LOGGING = False try: - t = utils.StepTimeCounter() aigenerated.download_file_from_url(excel_url, filename) - excel_info["stat"]['download'] = t.step() - reader = translations.create_reader(filename) print("Reader info") print(reader.info()) - excel_info["stat"]['create_reader'] = t.step() - + while True: - excel_info['stat']['cycles'] += 1 print(f"Parsing sheet №{reader.get_sheet_index()+1} (from 1)") prs = parser.Parser(reader) @@ -103,7 +94,7 @@ def process_excel_file(facultet, excel_url, counter, timeid): print("parsed done!") if prs.parser_error is not None: - excel_info["parser_error_cycle_" + str(excel_info['stat']['cycles'])] = prs.parser_error + excel_info["parser_error_cycle_" + str(reader.get_sheet_index()+1)] = prs.parser_error for group_name in prs.groups.keys(): if group_name in result_groups.keys(): @@ -121,8 +112,6 @@ def process_excel_file(facultet, excel_url, counter, timeid): gr['facultet'] = facultet gr['data_source'] = excel_url.split("/")[-1] gr['debug'] = { - "counter": counter, - "timeid": timeid, "excel_url": excel_url, "reader_info": reader.info(), "reader_sheet_index": reader.get_sheet_index(), @@ -138,9 +127,6 @@ def process_excel_file(facultet, excel_url, counter, timeid): else: reader.next_sheet() print("Next sheet!") - - excel_info["stat"]['parse'] = t.step() - except Exception as e: print(f"Error while {excel_url}") @@ -164,6 +150,7 @@ def process_excel_file(facultet, excel_url, counter, timeid): faileds = [] def main(): + global result_groups, result t = utils.StepTimeCounter() try: try: @@ -179,22 +166,34 @@ def main(): print("main(); parse links starting...") EXCEL_LINKS = links_parser.parse_links(FACULTETS if DEBUG_ONE_FAC is None else [DEBUG_ONE_FAC]) - counter = 0 - timeid = str(round(time.time())) - for facultet in EXCEL_LINKS.keys(): - counter += 1000 - print(f"\n\n-- Факультет '{facultet}' --") - facultet_urls = EXCEL_LINKS[facultet] - for excel_url in facultet_urls: - counter += 1 - print(f"\n\n-- Ссылка --") - print(f"{excel_url}") - - print("Start processing excel file") - process_excel_file(facultet, excel_url, counter, timeid) - print("Excel file processing done!") + now_diffable_dates = links_parser.excels_to_diffabledates(EXCEL_LINKS) + prev_diffable_dates = None + if os.path.exists("diffable_dates.txt"): + with open(DIFFABLE_DATES, 'r') as fp: + prev_diffable_dates = fp.read().strip() + + with open(DIFFABLE_DATES, 'w') as fp: + fp.write(now_diffable_dates) + + if now_diffable_dates == prev_diffable_dates: + print("No date changes in vstu.ru website. Stopping") + return + + counter = 10000 + for excel_link in EXCEL_LINKS: + counter += 1 + facultet = excel_link['facultet'] + excel_url = excel_link['url'] + latest_changed = excel_link['last_changed'] + process_excel_file(facultet, excel_url, counter, latest_changed) print("Saving result.json") + group_names_alphabeticaly = sorted(result_groups.keys()) + sorted_groups = {} + for group_name in group_names_alphabeticaly: + sorted_groups[group_name] = result_groups[group_name] + + result['groups'] = sorted_groups result['stat']['total_parsing_time'] = t.step()