"""Download faculty timetable Excel files, parse them, and dump groups to JSON.

For every faculty the script obtains a list of Excel links, downloads each
file, runs the project parser over every sheet, and collects the parsed groups
into the module-level ``result`` dict, which is finally written to
``result.json``. Files that fail to process are recorded in ``faileds``.
"""
import json
import re
import time
import traceback
from urllib.parse import urljoin

import pandas as pd
import xlwt
import xlrd
import requests
from bs4 import BeautifulSoup

import aigenerated
import parser
import translations
import utils
import links_parser

# Project-wide convention: coordinates are given as ROW first, then COL,
# and numbering is zero-based.

FACULTETS = [
    "asp",
    "mag",
    "fastiv",
    "fat",
    "ftkm",
    "ftpp",
    "feu",
    "fevt",
    "htf",
    "vkf",
    "mmf",
    "fpik",
]

# Set to a single faculty code (e.g. 'fevt') to process only that faculty.
DEBUG_ONE_FAC = None

# Parsed groups keyed by group name; filled by process_excel_file().
result = {}
# One record per Excel file that raised during processing.
faileds = []


def process_excel_file(facultet: str, excel_url: str, counter: int, timeid: str) -> None:
    """Download one Excel file and merge its parsed groups into ``result``.

    Every sheet of the file is parsed in turn. A group whose name is already
    present in ``result`` is reported and skipped, so the first occurrence
    wins. Any exception is printed with its traceback and recorded in
    ``faileds``; it is not re-raised, so one broken file does not stop the run.

    Args:
        facultet: Faculty code the file belongs to.
        excel_url: URL of the Excel file to download.
        counter: Running number embedded in the local file name.
        timeid: Run identifier embedded in the local file name.
    """
    is_xlsx = excel_url.endswith(".xlsx")
    try:
        extension = ".xlsx" if is_xlsx else ".xls"
        filename = f"excels/{timeid}_{facultet}_[C{counter}]{extension}"
        aigenerated.download_file_from_url(excel_url, filename)

        reader = translations.create_reader(filename)
        print("Reader info")
        print(reader.info())

        while True:
            print(f"Parsing sheet №{reader.get_sheet_index()+1}")
            parser.LOGGING = False
            prs = parser.Parser(reader)
            prs.parse()

            for group_name, group in prs.groups.items():
                if group_name in result:
                    print(f" -- WTF -- Doubled groups -- name: {group_name}")
                    continue
                result[group_name] = group
                group['facultet'] = facultet
                # Last path segment of the URL, i.e. the remote file name.
                group['data_source'] = excel_url.split("/")[-1]
                group['parser_debug'] = {
                    "C_COUNTER": counter,
                    "timeid": timeid,
                    "excel_url": excel_url,
                    "reader_info": reader.info(),
                    "reader_sheet_index": reader.get_sheet_index(),
                    "filename": filename,
                }

            print(
                f"Populates {len(prs.groups)} groups to result: "
                + " ".join(prs.groups.keys())
            )

            if not reader.has_next_sheet():
                print("File ended")
                break
            reader.next_sheet()
            print("Next sheet!")
    except Exception as e:
        # Best-effort boundary: log the failure and carry on with other files.
        print(f"Error while {excel_url}")
        print(e)
        traceback.print_exc()
        faileds.append({
            "ex": e,
            "fac": facultet,
            "url": excel_url,
        })


def main() -> None:
    """Process every faculty's Excel links and save the result to result.json.

    ``counter`` grows by 1000 for each faculty and by 1 for each link; it is
    passed to process_excel_file() together with a timestamp-based run id.
    """
    faculties = FACULTETS if DEBUG_ONE_FAC is None else [DEBUG_ONE_FAC]
    excel_links = links_parser.parse_links(faculties)

    counter = 0
    timeid = str(round(time.time()))

    for facultet, facultet_urls in excel_links.items():
        counter += 1000
        print(f"\n\n-- Факультет '{facultet}' --")
        for excel_url in facultet_urls:
            counter += 1
            print("\n\n-- Ссылка --")
            print(f"{excel_url}")
            print("Start processing excel file")
            process_excel_file(facultet, excel_url, counter, timeid)
            print("Excel file processing done!")

    print("Saving result.json")
    # ensure_ascii=False writes non-ASCII text verbatim, so the encoding is
    # pinned to UTF-8 instead of the platform default; the context manager
    # guarantees the file is flushed and closed.
    with open('result.json', 'w', encoding='utf-8') as out_file:
        json.dump(result, out_file, indent=2, ensure_ascii=False)
    print("Saved to result.json")

    print("Faileds:")
    print(faileds)


if __name__ == "__main__":
    main()
    print("Bye!")