Files
VSTU_Schedule_Parser/main.py
2025-09-11 16:35:55 +03:00

112 lines
3.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import re
import time
import traceback
from urllib.parse import urljoin
import pandas as pd
import xlwt
import xlrd
import requests
from bs4 import BeautifulSoup
import aigenerated
import parser
import translations
import utils
import json
import links_parser
# Project-wide convention: coordinates are always given as ROW first, then COL, and both are zero-based.
# Faculty codes passed to links_parser.parse_links() by main() when no
# single-faculty debug override is set.
FACULTETS = [
    "asp",
    "mag",
    "fastiv",
    "fat",
    "ftkm",
    "ftpp",
    "feu",
    "fevt",
    "htf",
    "vkf",
    "mmf",
    "fpik",
]

# Debug switch: when not None, main() processes only this one faculty code
# instead of the whole FACULTETS list.
DEBUG_ONE_FAC = None  # e.g. 'fevt'
def process_excel_file(facultet, excel_url, counter, timeid):
    """Download one schedule workbook and merge its parsed groups into ``result``.

    The workbook is saved under ``excels/`` with a name built from ``timeid``,
    ``facultet`` and ``counter``, then every sheet is run through
    ``parser.Parser``. Each group whose name is not yet in the module-level
    ``result`` dict is stored there and tagged with its faculty, source file
    name and a ``parser_debug`` record. Group names that are already present
    are reported and skipped.

    Any exception raised while downloading or parsing is printed with its
    traceback and recorded in the module-level ``faileds`` list; it is not
    re-raised. Groups stored before the failure stay in ``result``.

    Args:
        facultet: Faculty code the workbook belongs to.
        excel_url: URL of the workbook; a ``.xlsx`` suffix selects the
            ``.xlsx`` extension for the local copy, anything else gets ``.xls``.
        counter: Number embedded in the local file name and the debug record.
        timeid: Run identifier string used as the local file name prefix.
    """
    is_xlsx = excel_url.endswith(".xlsx")
    try:
        suffix = "x" if is_xlsx else ""
        filename = "excels/" + timeid + "_" + facultet + f"_[C{counter}]" + ".xls" + suffix
        aigenerated.download_file_from_url(excel_url, filename)

        reader = translations.create_reader(filename)
        print("Reader info")
        print(reader.info())

        # Last path segment of the URL; identical for every group in this file.
        source_name = excel_url.split("/")[-1]

        while True:
            sheet_number = reader.get_sheet_index() + 1
            print(f"Parsing sheet №{sheet_number}")

            parser.LOGGING = False
            prs = parser.Parser(reader)
            prs.parse()

            for group_name in prs.groups.keys():
                if group_name in result:
                    # First occurrence wins; later duplicates are only reported.
                    print(f" -- WTF -- Doubled groups -- name: {group_name}")
                    continue

                group = prs.groups[group_name]
                result[group_name] = group
                group['facultet'] = facultet
                group['data_source'] = source_name
                group['parser_debug'] = {
                    "C_COUNTER": counter,
                    "timeid": timeid,
                    "excel_url": excel_url,
                    "reader_info": reader.info(),
                    "reader_sheet_index": reader.get_sheet_index(),
                    "filename": filename,
                }

            print(f"Populates {len(prs.groups)} groups to result: " + " ".join(prs.groups.keys()))

            if reader.has_next_sheet():
                reader.next_sheet()
                print("Next sheet!")
                continue

            print("File ended")
            break
    except Exception as error:
        # Best-effort processing: log the failure, remember it, and let the
        # caller carry on with the next workbook.
        print(f"Error while {excel_url}")
        print(error)
        traceback.print_exc()
        faileds.append({
            "ex": error,
            "fac": facultet,
            "url": excel_url,
        })
# Parsed groups accumulated across all processed workbooks, keyed by group
# name. Filled by process_excel_file() and dumped to result.json by main().
result = {}
# One entry per workbook whose processing raised an exception; each entry is a
# dict with the keys "ex" (the exception), "fac" (faculty) and "url".
faileds = []
def main():
    """Process every faculty's schedule workbooks and write ``result.json``.

    Workbook links are obtained from ``links_parser.parse_links`` for all of
    ``FACULTETS``, or for ``DEBUG_ONE_FAC`` alone when it is set. Each link is
    handed to ``process_excel_file``, which fills the module-level ``result``
    and ``faileds`` collections. Afterwards ``result`` is saved as JSON and
    the list of failed workbooks is printed.
    """
    faculties = FACULTETS if DEBUG_ONE_FAC is None else [DEBUG_ONE_FAC]
    excel_links = links_parser.parse_links(faculties)

    # counter jumps by 1000 for every faculty and by 1 for every workbook; it
    # ends up in the downloaded file name and in each group's debug record.
    counter = 0
    # Whole-second Unix timestamp shared by all files downloaded in this run.
    timeid = str(round(time.time()))

    for facultet, facultet_urls in excel_links.items():
        counter += 1000
        print(f"\n\n-- Факультет '{facultet}' --")
        for excel_url in facultet_urls:
            counter += 1
            print("\n\n-- Ссылка --")
            print(f"{excel_url}")
            print("Start processing excel file")
            process_excel_file(facultet, excel_url, counter, timeid)
            print("Excel file processing done!")

    print("Saving result.json")
    # ensure_ascii=False writes non-ASCII text verbatim, so the encoding is
    # pinned to UTF-8 instead of the platform default, and the context manager
    # guarantees the file is flushed and closed.
    with open('result.json', 'w', encoding='utf-8') as result_file:
        json.dump(result, result_file, indent=2, ensure_ascii=False)
    print("Saved to result.json")

    print("Faileds:")
    print(faileds)
# Script entry point: run the full pipeline only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()
    print("Bye!")