Upload app.py
app.py CHANGED
@@ -1,55 +1,76 @@
 import os
+import io
 import re
 import time
 import requests
+import pdfplumber
 import numpy as np
 import pandas as pd
 from bs4 import BeautifulSoup
-import subprocess
-import sys
-import gradio as gr
 import folium
 from folium.plugins import FloatImage
+import gradio as gr
 
-# Install Playwright and dependencies if running in Hugging Face Space
-if "SPACE_ID" in os.environ:
-    print("Running in Hugging Face Space - installing Playwright...")
-    subprocess.run([sys.executable, "-m", "playwright", "install", "chromium"], check=True)
-
-# Import Playwright after potential installation
-from playwright.sync_api import sync_playwright
 
 # Configuration
 file_name = 'bathing_sites.csv'
 url = 'https://eau.gouvernement.lu/fr/domaines-activite/eauxbaignade/sites-de-baignade.html'
 
-# Data processing functions
-def get_final_url(url):
-    with sync_playwright() as p:
-        browser = p.chromium.launch(headless=True)
-        page = browser.new_page()
-        page.set_extra_http_headers({"max-redirects": "9"})
-        if (('&X=' not in url) or ('&Y=' not in url)):
-            page.goto(url, timeout=5000)
-            page.wait_for_timeout(2000)
-            url = page.url
-        browser.close()
-        return url
-
 def extract_coordinates(url):
     x_match = re.search(r'X=(\d+)', url)
    y_match = re.search(r'Y=(\d+)', url)
 
-    x = x_match.group(1) if x_match else 0
-    y = y_match.group(1) if y_match else 0
+    x = int(x_match.group(1) if x_match else 0)
+    y = int(y_match.group(1) if y_match else 0)
+
+    R = 6378137 # Earth's radius in meters
+    if x != 0:
+        x = (x / R) * (180 / np.pi)
+    if y != 0:
+        y = (180 / np.pi) * (2 * np.arctan(np.exp(y / R)) - np.pi / 2)
 
     return pd.Series([x, y])
 
-def web_mercator_to_wgs84(x, y):
-    R = 6378137
-    long = (x / R) * (180 / np.pi)
-    lat = (180 / np.pi) * (2 * np.arctan(np.exp(y / R)) - np.pi / 2)
-    return lat, long
+def get_coordinates(pdf_list):
+    sites_list = []
+    for lake in pdf_list:
+        url_pdf = 'https:' + lake
+        response_pdf = requests.get(url_pdf)
+        bytes_io = io.BytesIO(response_pdf.content)
+
+        with pdfplumber.open(bytes_io) as pdf:
+            page = pdf.pages[0]
+            text = page.extract_text()
+            site = text.split('\n')[1].split(' ')[-1].split('’')[-1].replace('-', ' ').title().replace('Sure', 'Sûre').strip()
+            for page in pdf.pages:
+                tables = page.extract_table()
+                if tables and ('baignade' in tables[0][0]):
+                    headers = tables[0]
+                    headers = headers[:3]
+                    headers.append('Sector')
+                    headers.append('Lake')
+                    i = 1
+                    for table in tables[1:]:
+                        table = table[:3]
+                        if (site == 'Weiswampach') or (site == 'Remerschen'):
+                            table.append('Zone' + ' ' + str(i))
+                        elif site == 'Echternach':
+                            table.append('Designated Zone')
+                        else:
+                            table.append(table[0].split(' ')[1].strip())
+                        table.append(site)
+                        sites_list.append(table)
+                        i += 1
+
+    df = pd.DataFrame(sites_list, columns = headers)
+    df = df.dropna()
+    df = df.iloc[:, 1:]
+    df = df.iloc[:, ::-1]
+    df.columns = ['Lake', 'Sector', 'Y', 'X']
+    df[['Y', 'X']] = df[['Y', 'X']].apply(pd.to_numeric, errors='coerce')
+    df = df.drop_duplicates(subset = ['Lake', 'Sector'], keep = 'last').reset_index(drop = True)
+    return df
+
 
 def file_download():
     df = pd.read_html(url)[0]
@@ -58,9 +79,9 @@ def file_download():
     soup = BeautifulSoup(response.text, 'html.parser')
 
     df['images'] = [tag.find("img")["src"] for tag in soup.select("td:has(img)")]
-    df['URL coordinates'] = [tag.find("a")["href"] for tag in soup.select("td:has(a)")]
-    df['URL coordinates'] = df['URL coordinates'].apply(get_final_url)
-
+    df['URL coordinates'] = [tag.find("a")["href"] for tag in soup.select("td:has(a)") if 'geoportail' in tag.find("a")["href"]]
+    pdf_list = [tag.find("a")["href"] for tag in soup.select("td:has(a)") if 'pdf' in tag.find("a")["href"]]
+    df_coord = get_coordinates(pdf_list)
 
     df.columns = ['Lake', 'Sector', 'Water Quality', 'Swimming allowed', 'Reason for ban', 'Traffic lights', 'URL coordinates']
 
@@ -76,10 +97,18 @@ def file_download():
     df.loc[df['Traffic lights'].str.contains('greng'), 'Swimming allowed'] = 'Yes'
     df.loc[df['Traffic lights'].str.contains('roud'), 'Swimming allowed'] = 'No'
     df = df.fillna('N/A')
 
-    df[['X', 'Y']] = df['URL coordinates'].apply(extract_coordinates)
-    df[['X', 'Y']] = df[['X', 'Y']].apply(pd.to_numeric, errors='coerce')
-    df[['lat', 'long']] = df.apply(lambda row: web_mercator_to_wgs84(row['X'], row['Y']), axis=1, result_type='expand')
+    df[['long', 'lat']] = df['URL coordinates'].apply(extract_coordinates)
+    df[['long', 'lat']] = df[['long', 'lat']].apply(pd.to_numeric, errors='coerce')
+
+    df = df.reset_index(drop = True)
+    df = pd.merge(left=df, right=df_coord, how='left', left_on=['Lake', 'Sector'], right_on=['Lake', 'Sector'])
+
+    df.loc[df['long']==0, 'long'] = np.nan
+    df.loc[df['lat']==0, 'lat'] = np.nan
+    df['long'] = df['long'].fillna(df['X'])
+    df['lat'] = df['lat'].fillna(df['Y'])
+
     df.drop(columns=['Traffic lights', 'URL coordinates', 'X', 'Y'], inplace=True)
 
     df.to_csv(file_name, index=False)
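Note on the coordinate conversion above: the `X=` / `Y=` values pulled out of the geoportail.lu links appear to be Web Mercator (EPSG:3857) metres, and the new `extract_coordinates` turns them into WGS84 degrees with the spherical-Mercator inverse formulas. A minimal standalone sketch of that conversion; the function name and the sample coordinate pair are made up for illustration, not part of app.py:

import numpy as np

R = 6378137  # radius of the Web Mercator sphere, in metres

def mercator_to_wgs84(x, y):
    # EPSG:3857 metres -> (longitude, latitude) in degrees
    lon = (x / R) * (180 / np.pi)
    lat = (180 / np.pi) * (2 * np.arctan(np.exp(y / R)) - np.pi / 2)
    return lon, lat

# Illustrative values of the kind found in '...&X=...&Y=...' map links
print(mercator_to_wgs84(682000, 6379000))  # roughly (6.1, 49.6), central Luxembourg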