Spaces:
Runtime error
Runtime error
| #!/usr/bin/env python3 | |
| import pathlib | |
| import ssl | |
| import shutil | |
| import csv | |
| import concurrent.futures | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from progress.bar import ChargingBar | |
| from entity import Entity | |
| from common import selectors | |
| import screenshot | |
def write_cert(e: Entity):
    """Fetch the TLS certificate served by the entity's host and save it.

    The host name is parsed from ``e.url``; the PEM text returned by
    ``ssl.get_server_certificate`` for port 443 is written to
    ``{e.DATA_PATH}/cert``.  The function is best effort: any failure,
    including a URL from which no host can be extracted, is written to
    ``{e.DATA_PATH}/error.log`` instead of being raised.

    Args:
        e: Entity exposing ``url`` and ``DATA_PATH`` attributes.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urlsplit

    try:
        # ``hostname`` strips scheme, credentials, port and path.  A plain
        # split("/")[2] would keep "host:8080" and break the TLS lookup,
        # and would raise IndexError for a URL without "//".
        host = urlsplit(e.url).hostname
        if not host:
            raise ValueError(f"cannot determine host from URL {e.url!r}")
        cert = ssl.get_server_certificate((host, 443), ca_certs=None)
        with open(f"{e.DATA_PATH}/cert", 'w') as f:
            f.write(cert)
    except Exception as err:
        # Deliberate catch-all: certificate retrieval must never abort the
        # rest of the per-entity processing; the reason is kept on disk.
        with open(f"{e.DATA_PATH}/error.log", 'w+') as f:
            f.write(str(err))
def get_logos(e: Entity, page):
    """Download every logo image referenced by the entity's page.

    Elements matching ``selectors.logo`` are looked up in ``page.content``.
    Each element's ``src`` is resolved against ``e.url`` and the image is
    saved as ``{e.DATA_PATH}/{index}.{ext}``, where ``index`` is the
    element's position among the matches.

    Args:
        e: Entity exposing ``url`` and ``DATA_PATH`` attributes.
        page: Response-like object whose ``content`` holds the page HTML.

    Returns:
        List of the file paths written, in document order.

    Raises:
        requests.RequestException: if downloading a logo fails.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin, urlsplit

    soup = BeautifulSoup(page.content, "html.parser")
    lfn = []
    for i, logo in enumerate(soup.select(selectors.logo)):
        src = logo.attrs.get('src')
        if not src:
            # Matched element carries no image source; nothing to fetch.
            continue
        # Take the extension from the URL path only, so a query string
        # such as "logo.png?v=1.2" yields "png" rather than "2".
        ext = urlsplit(src).path.rsplit('.', 1)[-1].split('/')[-1]
        fn = f"{e.DATA_PATH}/{i}.{ext}"
        # urljoin leaves absolute URLs untouched and resolves relative or
        # protocol-relative ones against the entity URL.
        with requests.get(urljoin(e.url, src), stream=True) as res:
            with open(fn, "wb") as f:
                # iter_content applies content decoding (gzip/deflate),
                # which copying res.raw directly would skip.
                for chunk in res.iter_content(chunk_size=8192):
                    f.write(chunk)
        lfn.append(fn)
    return lfn
def query_vendor_site(e: Entity):
    """Collect the certificate, logos and a screenshot for one entity.

    Creates ``./data/{e.bco}``, fetches ``e.url`` (retrying once over HTTPS
    when a plain-HTTP request fails, in which case ``e.url`` is updated in
    place), then runs ``write_cert``, ``get_logos`` and
    ``screenshot.sc_entity`` for the entity.

    Args:
        e: Entity exposing ``url``, ``bco`` and ``DATA_PATH`` attributes.

    Returns:
        Tuple ``(cert, logos)``.  ``cert`` is the path
        ``{e.DATA_PATH}/cert`` if that file exists after ``write_cert``
        ran, otherwise ``None``.  ``logos`` is whatever
        ``get_logos(e, page)`` returned.

    Raises:
        requests.RequestException: if the page cannot be fetched.
    """
    # NOTE(review): presumably e.DATA_PATH points at this same directory -
    # verify against the Entity definition.
    pathlib.Path(f"./data/{e.bco}").mkdir(parents=True, exist_ok=True)
    try:
        page = requests.get(e.url)
    except requests.RequestException:
        # Only a plain-HTTP URL can be upgraded.  A blanket
        # replace('http', 'https') would turn "https://" into "httpss://"
        # and rewrite any later "http" inside the URL.
        if not e.url.startswith('http://'):
            raise
        e.url = 'https://' + e.url[len('http://'):]
        page = requests.get(e.url)
    write_cert(e)
    logos = get_logos(e, page)
    screenshot.sc_entity(e)
    # write_cert reports failure through error.log rather than a return
    # value, so the certificate file's presence is the success signal.
    cert_path = f"{e.DATA_PATH}/cert"
    cert = cert_path if pathlib.Path(cert_path).is_file() else None
    return (cert, logos)
def from_csv(fn):
    """Run ``query_vendor_site`` for every entity listed in a CSV file.

    Each CSV row is turned into an entity with ``Entity.from_dict`` and
    submitted to a pool of five worker threads.  As each job completes its
    result is printed, or the exception it raised is reported, and the
    progress bar advances by one.

    Args:
        fn: Path of the CSV file to read; a header row is required because
            rows are read with ``csv.DictReader``.
    """
    # Read all rows up front so the file is closed before the network work.
    with open(fn, newline='') as csvfile:
        entities = [Entity.from_dict(row) for row in csv.DictReader(csvfile)]

    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as pool:
        # Map each future back to its entity for error reporting.
        pending = {}
        for entity in entities:
            pending[pool.submit(query_vendor_site, entity)] = entity

        bar = ChargingBar('Processing', max=len(pending))
        for done in concurrent.futures.as_completed(pending):
            entity = pending[done]
            try:
                (cert, logos) = done.result()
            except Exception as exc:
                # One failing site must not stop the remaining jobs.
                print('%r generated an exception: %s' % (entity, exc))
            else:
                print(cert, logos)
            bar.next()
        bar.finish()
# Disabled debug hook for trying a single site without reading the CSV.
# NOTE(review): the call below is stale - query_vendor_site takes a single
# Entity argument, not (url, label).
#query_vendor_site('http://www.bancoprovincia.com.ar', 'debug')
#exit()
# Script entry point: process every entity listed in entidades.csv.
if __name__ == '__main__':
    from_csv('entidades.csv')