# VSTU_Schedule_Parser/main.py

import json
import re
import time
import traceback
from urllib.parse import urljoin
import pandas as pd
import xlwt

import xlrd
import requests


from bs4 import BeautifulSoup
import aigenerated
import parser
import translations
import utils
import json
import links_parser
# Project-wide convention: coordinates are always given as ROW first, then COL; numbering is zero-based.

# Faculty identifiers handed to links_parser.parse_links() by main().
FACULTETS = [
    "asp",
    "mag",
    "fastiv",
    "fat",
    "ftkm",
    "ftpp",
    "feu",
    "fevt",
    "htf",
    "vkf",
    "mmf",
    "fpik",
]

# Debug switch: when not None, main() processes only this single faculty
# (e.g. 'fevt') instead of the whole FACULTETS list.
DEBUG_ONE_FAC = None


def process_excel_file(facultet, excel_url, counter, timeid):
    """Fetch one workbook, parse every sheet and merge the groups into ``result``.

    The workbook is saved under ``excels/`` via
    ``aigenerated.download_file_from_url`` and opened with
    ``translations.create_reader``.  Each sheet is parsed with
    ``parser.Parser``; every parsed group is stored in the module-level
    ``result`` dict under its group name and annotated with the faculty,
    the source file name and a ``parser_debug`` record.  A group name that
    is already present in ``result`` is reported and skipped.

    Any exception raised while handling the workbook is printed together
    with its traceback and recorded in the module-level ``faileds`` list.
    It is not re-raised, so one broken workbook does not abort the run.

    Args:
        facultet: Faculty identifier the workbook belongs to.
        excel_url: URL of the workbook.
        counter: Number embedded in the local file name and stored in the
            debug record.
        timeid: String used as the prefix of the local file name.
    """
    # Local import keeps this block self-contained; ``os`` is only needed here.
    import os

    try:
        # Case-insensitive check so that links ending in ".XLSX" keep the
        # correct extension on disk as well.
        extension = "xlsx" if excel_url.lower().endswith(".xlsx") else "xls"
        filename = f"excels/{timeid}_{facultet}_[C{counter}].{extension}"

        # Make sure the target directory exists; this is a no-op when it
        # is already there.
        os.makedirs("excels", exist_ok=True)
        aigenerated.download_file_from_url(excel_url, filename)

        reader = translations.create_reader(filename)
        print("Reader info")
        print(reader.info())

        while True:
            print(f"Parsing sheet №{reader.get_sheet_index()+1}")
            parser.LOGGING = False
            prs = parser.Parser(reader)
            prs.parse()

            # Names of the groups actually stored from this sheet, so the
            # summary below does not count skipped duplicates.
            added = []
            for group_name, group in prs.groups.items():
                if group_name in result:
                    print(f" -- WTF -- Doubled groups -- name: {group_name}")
                    continue

                result[group_name] = group
                group['facultet'] = facultet
                group['data_source'] = excel_url.split("/")[-1]
                group['parser_debug'] = {
                    "C_COUNTER": counter,
                    "timeid": timeid,
                    "excel_url": excel_url,
                    "reader_info": reader.info(),
                    "reader_sheet_index": reader.get_sheet_index(),
                    "filename": filename,
                }
                added.append(group_name)

            print(f"Populates {len(added)} groups to result: " + " ".join(added))

            if not reader.has_next_sheet():
                print("File ended")
                break

            reader.next_sheet()
            print("Next sheet!")

    except Exception as e:
        # Deliberate best-effort boundary: log the failure, remember it and
        # let the caller continue with the next workbook.
        print(f"Error while {excel_url}")
        print(e)
        traceback.print_exc()
        faileds.append({
            "ex": e,
            "fac": facultet,
            "url": excel_url,
        })


# Parsed groups keyed by group name; filled in by process_excel_file().
result: dict = {}
# One record per workbook whose processing raised; filled in by process_excel_file().
faileds: list = []
def main():
    """Process every faculty's workbooks and dump the outcome to ``result.json``.

    The workbook links are obtained from ``links_parser.parse_links`` for
    all of ``FACULTETS``, or only for ``DEBUG_ONE_FAC`` when that is set.
    Each link is passed to ``process_excel_file``, which fills the
    module-level ``result`` and ``faileds`` containers.  Afterwards
    ``result`` is written to ``result.json`` and ``faileds`` is printed.
    """
    faculties = FACULTETS if DEBUG_ONE_FAC is None else [DEBUG_ONE_FAC]
    excel_links = links_parser.parse_links(faculties)

    counter = 0
    timeid = str(round(time.time()))
    for facultet, facultet_urls in excel_links.items():
        # The counter advances by 1000 per faculty and by 1 per link;
        # presumably this keeps the numbers of different faculties visually
        # apart in file names - TODO confirm.
        counter += 1000
        print(f"\n\n-- Факультет '{facultet}' --")
        for excel_url in facultet_urls:
            counter += 1
            print("\n\n-- Ссылка --")
            print(f"{excel_url}")

            print("Start processing excel file")
            process_excel_file(facultet, excel_url, counter, timeid)
            print("Excel file processing done!")

    print("Saving result.json")
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # must be opened as UTF-8 explicitly instead of relying on the platform
    # default encoding; the context manager guarantees the handle is closed.
    with open('result.json', 'w', encoding='utf-8') as output:
        json.dump(result, output, indent=2, ensure_ascii=False)
    print("Saved to result.json")

    print("Faileds:")
    print(faileds)


# Script entry point: run the whole pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
    print("Bye!")