112 lines
3.3 KiB
Python
112 lines
3.3 KiB
Python
import json
|
||
import re
|
||
import time
|
||
import traceback
|
||
from urllib.parse import urljoin
|
||
import pandas as pd
|
||
import xlwt
|
||
|
||
import xlrd
|
||
import requests
|
||
|
||
|
||
from bs4 import BeautifulSoup
|
||
import aigenerated
|
||
import parser
|
||
import translations
|
||
import utils
|
||
import json
|
||
import links_parser
|
||
# Project-wide convention: coordinates are always given as ROW first and
# then COL; numbering is zero-based.

# Faculty codes handed to links_parser.parse_links() by main().
FACULTETS = [
    "asp",
    "mag",
    "fastiv",
    "fat",
    "ftkm",
    "ftpp",
    "feu",
    "fevt",
    "htf",
    "vkf",
    "mmf",
    "fpik",
]

# When not None, main() processes only this single faculty code instead of
# the whole FACULTETS list (debugging aid), e.g. 'fevt'.
DEBUG_ONE_FAC = None
|
||
|
||
|
||
def process_excel_file(facultet, excel_url, counter, timeid):
    """Download one Excel file, parse every sheet and merge the groups into ``result``.

    The file at ``excel_url`` is saved under ``excels/`` with a name built
    from ``timeid``, ``facultet`` and ``counter``. A reader is created for it
    with ``translations.create_reader`` and each sheet is parsed with
    ``parser.Parser``. Every parsed group whose name is not yet present in
    the module-level ``result`` dict is stored there and annotated with the
    faculty, the source file name and a ``parser_debug`` record. Groups whose
    name is already present are skipped with a warning.

    Any exception raised while downloading or parsing is printed together
    with its traceback and recorded in the module-level ``faileds`` list; it
    is not re-raised, so the caller can continue with the next file.

    Args:
        facultet: Faculty code the file belongs to.
        excel_url: URL of the Excel file to download.
        counter: Number embedded in the local file name and debug record.
        timeid: Run identifier string embedded in the local file name.
    """
    # Compare case-insensitively so a link ending in ".XLSX" is not saved
    # under a ".xls" name.
    is_xlsx = excel_url.lower().endswith(".xlsx")
    try:
        extension = ".xlsx" if is_xlsx else ".xls"
        filename = f"excels/{timeid}_{facultet}_[C{counter}]{extension}"
        aigenerated.download_file_from_url(excel_url, filename)

        reader = translations.create_reader(filename)
        print("Reader info")
        print(reader.info())

        while True:
            print(f"Parsing sheet №{reader.get_sheet_index()+1}")
            parser.LOGGING = False
            prs = parser.Parser(reader)
            prs.parse()

            for group_name, group in prs.groups.items():
                if group_name in result:
                    # Keep the first occurrence of a group; later duplicates
                    # are reported and ignored.
                    print(f" -- WTF -- Doubled groups -- name: {group_name}")
                    continue

                result[group_name] = group
                group['facultet'] = facultet
                # Last path component of the URL, i.e. the remote file name.
                group['data_source'] = excel_url.split("/")[-1]
                group['parser_debug'] = {
                    "C_COUNTER": counter,
                    "timeid": timeid,
                    "excel_url": excel_url,
                    "reader_info": reader.info(),
                    "reader_sheet_index": reader.get_sheet_index(),
                    "filename": filename,
                }

            print(f"Populates {len(prs.groups)} groups to result: " + " ".join(prs.groups.keys()))

            if not reader.has_next_sheet():
                print("File ended")
                break

            reader.next_sheet()
            print("Next sheet!")

    except Exception as e:
        # Best-effort processing: log the failure and remember it so that
        # main() can report all failed files at the end of the run.
        print(f"Error while {excel_url}")
        print(e)
        traceback.print_exc()
        faileds.append({
            "ex": e,
            "fac": facultet,
            "url": excel_url,
        })
|
||
|
||
|
||
# Parsed groups keyed by group name; filled in by process_excel_file().
result: dict = {}

# One record per Excel file that failed to process (exception, faculty code
# and URL); appended to by process_excel_file().
faileds: list = []
|
||
def main():
    """Process every Excel link of the selected faculties and write ``result.json``.

    The links are obtained from ``links_parser.parse_links`` for either all
    ``FACULTETS`` or, when ``DEBUG_ONE_FAC`` is set, that single faculty.
    Each link is passed to ``process_excel_file``; afterwards the accumulated
    ``result`` dict is written to ``result.json`` and the list of failed
    files is printed.
    """
    faculties = FACULTETS if DEBUG_ONE_FAC is None else [DEBUG_ONE_FAC]
    excel_links = links_parser.parse_links(faculties)

    # The counter advances by 1000 per faculty and by 1 per file; it is
    # passed on to process_excel_file together with the run's time id.
    counter = 0
    timeid = str(round(time.time()))

    for facultet, facultet_urls in excel_links.items():
        counter += 1000
        print(f"\n\n-- Факультет '{facultet}' --")

        for excel_url in facultet_urls:
            counter += 1
            print("\n\n-- Ссылка --")
            print(f"{excel_url}")

            print("Start processing excel file")
            process_excel_file(facultet, excel_url, counter, timeid)
            print("Excel file processing done!")

    print("Saving result.json")
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # must be opened with an explicit UTF-8 encoding instead of the
    # locale-dependent default; the context manager guarantees the file is
    # flushed and closed.
    with open('result.json', 'w', encoding='utf-8') as out_file:
        json.dump(result, out_file, indent=2, ensure_ascii=False)
    print("Saved to result.json")

    print("Faileds:")
    print(faileds)
|
||
|
||
|
||
# Script entry point: run the whole pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
    print("Bye!")
|