Economy resources, sorts and latest_changes
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -4,3 +4,4 @@ __pycache__
|
|||||||
.idea
|
.idea
|
||||||
result*.json
|
result*.json
|
||||||
groups.json
|
groups.json
|
||||||
|
diffable_dates.txt
|
||||||
@@ -10,6 +10,11 @@ from bs4 import BeautifulSoup
|
|||||||
BASE_URL = "https://www.vstu.ru/"
|
BASE_URL = "https://www.vstu.ru/"
|
||||||
RASP_PREFIX = "https://www.vstu.ru/student/raspisaniya/zanyatiy/index.php?dep="
|
RASP_PREFIX = "https://www.vstu.ru/student/raspisaniya/zanyatiy/index.php?dep="
|
||||||
|
|
||||||
|
def sibling_clear_to_date(s: str) -> str:
    """Extract the bare "last changed" text from the node following a link.

    The input is expected to look like ``"(последнее изменение: <date>)"``.
    The value is lower-cased, the ``"(последнее изменение:"`` prefix and every
    ``")"`` are removed, and surrounding whitespace is stripped.

    Args:
        s: Text found next to the link. ``None`` and any non-string object
            (for example a tag node instead of a text node) are accepted and
            treated as "no date available".

    Returns:
        The cleaned text, or the sentinel ``"!!!Python None!!!"`` when no
        text is available.
    """
    # The caller passes `a.next_sibling`, which is not guaranteed to be text.
    # Calling str methods on a non-string sibling would raise, so anything
    # that is not a string gets the same sentinel as a missing sibling.
    if not isinstance(s, str):
        return "!!!Python None!!!"
    return s.lower().replace("(последнее изменение:", "").replace(")", "").strip()
|
||||||
|
|
||||||
# Парсит ссылки на эксель .xls & .xlsx файлы и выдаёт их
|
# Парсит ссылки на эксель .xls & .xlsx файлы и выдаёт их
|
||||||
def parse_links(facultets):
|
def parse_links(facultets):
|
||||||
session = requests.Session()
|
session = requests.Session()
|
||||||
@@ -27,7 +32,7 @@ def parse_links(facultets):
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
EXCEL_LINKS = {}
|
EXCEL_LINKS = []
|
||||||
for facultet in facultets:
|
for facultet in facultets:
|
||||||
url = RASP_PREFIX + facultet
|
url = RASP_PREFIX + facultet
|
||||||
print("getting...")
|
print("getting...")
|
||||||
@@ -38,19 +43,23 @@ def parse_links(facultets):
|
|||||||
|
|
||||||
# Ищем все теги <a>, у которых атрибут href соответствует нашему паттерну
|
# Ищем все теги <a>, у которых атрибут href соответствует нашему паттерну
|
||||||
excel_tags = soup.find_all('a', href=excel_pattern)
|
excel_tags = soup.find_all('a', href=excel_pattern)
|
||||||
excel_links = [tag.get('href') for tag in excel_tags]
|
for a in excel_tags:
|
||||||
|
last_changed = sibling_clear_to_date(a.next_sibling)
|
||||||
|
url = urljoin(BASE_URL, a.get('href'))
|
||||||
|
record = {
|
||||||
|
"facultet": facultet,
|
||||||
|
"url": url,
|
||||||
|
"last_changed": last_changed
|
||||||
|
}
|
||||||
|
print(record)
|
||||||
|
EXCEL_LINKS.append(record)
|
||||||
|
|
||||||
# Предположим, вы уже получили excel_links из одного из методов выше
|
return sorted(EXCEL_LINKS, key=lambda x: x['url'])
|
||||||
# excel_links = ['../../../upload/raspisanie/z/ОН_ХТФ_1 курс.xlsx', ...]
|
|
||||||
|
|
||||||
absolute_links = [urljoin(BASE_URL, relative_link) for relative_link in excel_links]
|
|
||||||
|
|
||||||
if facultet not in EXCEL_LINKS.keys():
|
def excels_to_diffabledates(excels):
|
||||||
EXCEL_LINKS[facultet] = set()
|
dates = []
|
||||||
|
for excel in excels:
|
||||||
for excel_url in absolute_links:
|
dates.append(f"{excel['last_changed']} {excel['facultet']} {excel['url']}")
|
||||||
EXCEL_LINKS[facultet].add(excel_url)
|
|
||||||
print(f"+url {excel_url}")
|
|
||||||
|
|
||||||
return EXCEL_LINKS
|
|
||||||
|
|
||||||
|
return "\n".join(sorted(dates)).strip()
|
||||||
|
|||||||
67
main.py
67
main.py
@@ -19,10 +19,11 @@ import shutil
|
|||||||
def currt():
|
def currt():
|
||||||
return round(time.time())
|
return round(time.time())
|
||||||
|
|
||||||
FACULTETS = [
|
FACULTETS = sorted([
|
||||||
"asp", "mag", "fastiv", "fat", "ftkm", "ftpp", "feu", "fevt", "htf", "vkf", "mmf", "fpik"
|
"asp", "mag", "fastiv", "fat", "ftkm", "ftpp", "feu", "fevt", "htf", "vkf", "mmf", "fpik"
|
||||||
]
|
])
|
||||||
DIRNAME = "excels"
|
DIRNAME = "excels"
|
||||||
|
DIFFABLE_DATES = "diffable_dates.txt"
|
||||||
|
|
||||||
DEBUG_ONE_FAC = None #'htf'
|
DEBUG_ONE_FAC = None #'htf'
|
||||||
result_groups = {}
|
result_groups = {}
|
||||||
@@ -63,20 +64,15 @@ result = {
|
|||||||
"see_header_at_top_of_this_file": "SEE TOP OF THIS FILE | ОБРАТИТЕ ВНИМАНИЕ НА ВЕРХ ЭТОГО ФАЙЛА"
|
"see_header_at_top_of_this_file": "SEE TOP OF THIS FILE | ОБРАТИТЕ ВНИМАНИЕ НА ВЕРХ ЭТОГО ФАЙЛА"
|
||||||
}
|
}
|
||||||
|
|
||||||
def process_excel_file(facultet, excel_url, counter, timeid):
|
def process_excel_file(facultet, excel_url, counter, latest_changed):
|
||||||
is_xlsx = excel_url.endswith(".xlsx")
|
is_xlsx = excel_url.endswith(".xlsx")
|
||||||
filename = f"{DIRNAME}/" + timeid + f"_[C{counter}]_" + facultet + ".xls" + ("x" if is_xlsx else "")
|
filename = f"{DIRNAME}/" + f"_[C{counter}]_" + facultet + ".xls" + ("x" if is_xlsx else "")
|
||||||
|
|
||||||
excel_info = {
|
excel_info = {
|
||||||
"filename": excel_url.split("/")[-1],
|
"filename": excel_url.split("/")[-1],
|
||||||
"url": excel_url,
|
"url": excel_url,
|
||||||
|
"latest_changed": latest_changed,
|
||||||
"download_place": filename,
|
"download_place": filename,
|
||||||
"stat": {
|
|
||||||
"download": -1,
|
|
||||||
"create_reader": -1,
|
|
||||||
"parse": -1,
|
|
||||||
"cycles": 0
|
|
||||||
},
|
|
||||||
"group_names_parsed": [],
|
"group_names_parsed": [],
|
||||||
"facultet": facultet,
|
"facultet": facultet,
|
||||||
"counter": counter
|
"counter": counter
|
||||||
@@ -84,17 +80,12 @@ def process_excel_file(facultet, excel_url, counter, timeid):
|
|||||||
parser.LOGGING = False
|
parser.LOGGING = False
|
||||||
|
|
||||||
try:
|
try:
|
||||||
t = utils.StepTimeCounter()
|
|
||||||
aigenerated.download_file_from_url(excel_url, filename)
|
aigenerated.download_file_from_url(excel_url, filename)
|
||||||
excel_info["stat"]['download'] = t.step()
|
|
||||||
|
|
||||||
reader = translations.create_reader(filename)
|
reader = translations.create_reader(filename)
|
||||||
print("Reader info")
|
print("Reader info")
|
||||||
print(reader.info())
|
print(reader.info())
|
||||||
excel_info["stat"]['create_reader'] = t.step()
|
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
excel_info['stat']['cycles'] += 1
|
|
||||||
print(f"Parsing sheet №{reader.get_sheet_index()+1} (from 1)")
|
print(f"Parsing sheet №{reader.get_sheet_index()+1} (from 1)")
|
||||||
prs = parser.Parser(reader)
|
prs = parser.Parser(reader)
|
||||||
|
|
||||||
@@ -103,7 +94,7 @@ def process_excel_file(facultet, excel_url, counter, timeid):
|
|||||||
|
|
||||||
print("parsed done!")
|
print("parsed done!")
|
||||||
if prs.parser_error is not None:
|
if prs.parser_error is not None:
|
||||||
excel_info["parser_error_cycle_" + str(excel_info['stat']['cycles'])] = prs.parser_error
|
excel_info["parser_error_cycle_" + str(reader.get_sheet_index()+1)] = prs.parser_error
|
||||||
|
|
||||||
for group_name in prs.groups.keys():
|
for group_name in prs.groups.keys():
|
||||||
if group_name in result_groups.keys():
|
if group_name in result_groups.keys():
|
||||||
@@ -121,8 +112,6 @@ def process_excel_file(facultet, excel_url, counter, timeid):
|
|||||||
gr['facultet'] = facultet
|
gr['facultet'] = facultet
|
||||||
gr['data_source'] = excel_url.split("/")[-1]
|
gr['data_source'] = excel_url.split("/")[-1]
|
||||||
gr['debug'] = {
|
gr['debug'] = {
|
||||||
"counter": counter,
|
|
||||||
"timeid": timeid,
|
|
||||||
"excel_url": excel_url,
|
"excel_url": excel_url,
|
||||||
"reader_info": reader.info(),
|
"reader_info": reader.info(),
|
||||||
"reader_sheet_index": reader.get_sheet_index(),
|
"reader_sheet_index": reader.get_sheet_index(),
|
||||||
@@ -139,9 +128,6 @@ def process_excel_file(facultet, excel_url, counter, timeid):
|
|||||||
reader.next_sheet()
|
reader.next_sheet()
|
||||||
print("Next sheet!")
|
print("Next sheet!")
|
||||||
|
|
||||||
excel_info["stat"]['parse'] = t.step()
|
|
||||||
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error while {excel_url}")
|
print(f"Error while {excel_url}")
|
||||||
print(e)
|
print(e)
|
||||||
@@ -164,6 +150,7 @@ def process_excel_file(facultet, excel_url, counter, timeid):
|
|||||||
|
|
||||||
faileds = []
|
faileds = []
|
||||||
def main():
|
def main():
|
||||||
|
global result_groups, result
|
||||||
t = utils.StepTimeCounter()
|
t = utils.StepTimeCounter()
|
||||||
try:
|
try:
|
||||||
try:
|
try:
|
||||||
@@ -179,22 +166,34 @@ def main():
|
|||||||
|
|
||||||
print("main(); parse links starting...")
|
print("main(); parse links starting...")
|
||||||
EXCEL_LINKS = links_parser.parse_links(FACULTETS if DEBUG_ONE_FAC is None else [DEBUG_ONE_FAC])
|
EXCEL_LINKS = links_parser.parse_links(FACULTETS if DEBUG_ONE_FAC is None else [DEBUG_ONE_FAC])
|
||||||
counter = 0
|
now_diffable_dates = links_parser.excels_to_diffabledates(EXCEL_LINKS)
|
||||||
timeid = str(round(time.time()))
|
prev_diffable_dates = None
|
||||||
for facultet in EXCEL_LINKS.keys():
|
if os.path.exists("diffable_dates.txt"):
|
||||||
counter += 1000
|
with open(DIFFABLE_DATES, 'r') as fp:
|
||||||
print(f"\n\n-- Факультет '{facultet}' --")
|
prev_diffable_dates = fp.read().strip()
|
||||||
facultet_urls = EXCEL_LINKS[facultet]
|
|
||||||
for excel_url in facultet_urls:
|
|
||||||
counter += 1
|
|
||||||
print(f"\n\n-- Ссылка --")
|
|
||||||
print(f"{excel_url}")
|
|
||||||
|
|
||||||
print("Start processing excel file")
|
with open(DIFFABLE_DATES, 'w') as fp:
|
||||||
process_excel_file(facultet, excel_url, counter, timeid)
|
fp.write(now_diffable_dates)
|
||||||
print("Excel file processing done!")
|
|
||||||
|
if now_diffable_dates == prev_diffable_dates:
|
||||||
|
print("No date changes in vstu.ru website. Stopping")
|
||||||
|
return
|
||||||
|
|
||||||
|
counter = 10000
|
||||||
|
for excel_link in EXCEL_LINKS:
|
||||||
|
counter += 1
|
||||||
|
facultet = excel_link['facultet']
|
||||||
|
excel_url = excel_link['url']
|
||||||
|
latest_changed = excel_link['last_changed']
|
||||||
|
process_excel_file(facultet, excel_url, counter, latest_changed)
|
||||||
|
|
||||||
print("Saving result.json")
|
print("Saving result.json")
|
||||||
|
group_names_alphabeticaly = sorted(result_groups.keys())
|
||||||
|
sorted_groups = {}
|
||||||
|
for group_name in group_names_alphabeticaly:
|
||||||
|
sorted_groups[group_name] = result_groups[group_name]
|
||||||
|
|
||||||
|
result['groups'] = sorted_groups
|
||||||
|
|
||||||
result['stat']['total_parsing_time'] = t.step()
|
result['stat']['total_parsing_time'] = t.step()
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user