Upload app.py
app.py CHANGED
@@ -1,55 +1,76 @@
 import os
+import io
 import re
 import time
 import requests
+import pdfplumber
 import numpy as np
 import pandas as pd
 from bs4 import BeautifulSoup
-import subprocess
-import sys
-import gradio as gr
 import folium
 from folium.plugins import FloatImage
+import gradio as gr
 
-# Install Playwright and dependencies if running in Hugging Face Space
-if "SPACE_ID" in os.environ:
-    print("Running in Hugging Face Space - installing Playwright...")
-    subprocess.run([sys.executable, "-m", "playwright", "install", "chromium"], check=True)
-
-# Import Playwright after potential installation
-from playwright.sync_api import sync_playwright
 
 # Configuration
 file_name = 'bathing_sites.csv'
 url = 'https://eau.gouvernement.lu/fr/domaines-activite/eauxbaignade/sites-de-baignade.html'
 
-# Data processing functions
-def get_final_url(url):
-    with sync_playwright() as p:
-        browser = p.chromium.launch(headless=True)
-        page = browser.new_page()
-        page.set_extra_http_headers({"max-redirects": "9"})
-        if (('&X=' not in url) or ('&Y=' not in url)):
-            page.goto(url, timeout=5000)
-            page.wait_for_timeout(2000)
-            url = page.url
-        browser.close()
-        return url
-
 def extract_coordinates(url):
     x_match = re.search(r'X=(\d+)', url)
    y_match = re.search(r'Y=(\d+)', url)
 
-    x = x_match.group(1) if x_match else 0
-    y = y_match.group(1) if y_match else 0
+    x = int(x_match.group(1) if x_match else 0)
+    y = int(y_match.group(1) if y_match else 0)
+
+    R = 6378137 # Earth's radius in meters
+    if x != 0:
+        x = (x / R) * (180 / np.pi)
+    if y != 0:
+        y = (180 / np.pi) * (2 * np.arctan(np.exp(y / R)) - np.pi / 2)
 
     return pd.Series([x, y])
 
-def web_mercator_to_wgs84(x, y):
-    R = 6378137
-    long = (x / R) * (180 / np.pi)
-    lat = (180 / np.pi) * (2 * np.arctan(np.exp(y / R)) - np.pi / 2)
-    return lat, long
+def get_coordinates(pdf_list):
+    sites_list = []
+    for lake in pdf_list:
+        url_pdf = 'https:' + lake
+        response_pdf = requests.get(url_pdf)
+        bytes_io = io.BytesIO(response_pdf.content)
+
+        with pdfplumber.open(bytes_io) as pdf:
+            page = pdf.pages[0]
+            text = page.extract_text()
+            site = text.split('\n')[1].split(' ')[-1].split('’')[-1].replace('-', ' ').title().replace('Sure', 'Sûre').strip()
+            for page in pdf.pages:
+                tables = page.extract_table()
+                if tables and ('baignade' in tables[0][0]):
+                    headers = tables[0]
+                    headers = headers[:3]
+                    headers.append('Sector')
+                    headers.append('Lake')
+                    i = 1
+                    for table in tables[1:]:
+                        table = table[:3]
+                        if (site == 'Weiswampach') or (site == 'Remerschen'):
+                            table.append('Zone' + ' ' + str(i))
+                        elif site == 'Echternach':
+                            table.append('Designated Zone')
+                        else:
+                            table.append(table[0].split(' ')[1].strip())
+                        table.append(site)
+                        sites_list.append(table)
+                        i += 1
+
+    df = pd.DataFrame(sites_list, columns = headers)
+    df = df.dropna()
+    df = df.iloc[:, 1:]
+    df = df.iloc[:, ::-1]
+    df.columns = ['Lake', 'Sector', 'Y', 'X']
+    df[['Y', 'X']] = df[['Y', 'X']].apply(pd.to_numeric, errors='coerce')
+    df = df.drop_duplicates(subset = ['Lake', 'Sector'], keep = 'last').reset_index(drop = True)
+    return df
+
 
 def file_download():
     df = pd.read_html(url)[0]
@@ -58,9 +79,9 @@ def file_download():
     soup = BeautifulSoup(response.text, 'html.parser')
 
     df['images'] = [tag.find("img")["src"] for tag in soup.select("td:has(img)")]
-    df['URL coordinates'] = [tag.find("a")["href"] for tag in soup.select("td:has(a)")]
-    df['URL coordinates'] = df['URL coordinates'].apply(get_final_url)
-
+    df['URL coordinates'] = [tag.find("a")["href"] for tag in soup.select("td:has(a)") if 'geoportail' in tag.find("a")["href"]]
+    pdf_list = [tag.find("a")["href"] for tag in soup.select("td:has(a)") if 'pdf' in tag.find("a")["href"]]
+    df_coord = get_coordinates(pdf_list)
 
     df.columns = ['Lake', 'Sector', 'Water Quality', 'Swimming allowed', 'Reason for ban', 'Traffic lights', 'URL coordinates']
 
@@ -76,10 +97,18 @@ def file_download():
     df.loc[df['Traffic lights'].str.contains('greng'), 'Swimming allowed'] = 'Yes'
     df.loc[df['Traffic lights'].str.contains('roud'), 'Swimming allowed'] = 'No'
     df = df.fillna('N/A')
 
-    df[['X', 'Y']] = df['URL coordinates'].apply(extract_coordinates)
-    df[['X', 'Y']] = df[['X', 'Y']].apply(pd.to_numeric, errors='coerce')
-    df[['lat', 'long']] = df.apply(lambda row: web_mercator_to_wgs84(row['X'], row['Y']), axis=1, result_type='expand')
+    df[['long', 'lat']] = df['URL coordinates'].apply(extract_coordinates)
+    df[['long', 'lat']] = df[['long', 'lat']].apply(pd.to_numeric, errors='coerce')
+
+    df = df.reset_index(drop = True)
+    df = pd.merge(left=df, right=df_coord, how='left', left_on=['Lake', 'Sector'], right_on=['Lake', 'Sector'])
+
+    df.loc[df['long']==0, 'long'] = np.nan
+    df.loc[df['lat']==0, 'lat'] = np.nan
+    df['long'] = df['long'].fillna(df['X'])
+    df['lat'] = df['lat'].fillna(df['Y'])
+
     df.drop(columns=['Traffic lights', 'URL coordinates', 'X', 'Y'], inplace=True)
 
     df.to_csv(file_name, index=False)
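Note on the coordinate conversion above: the `X=` / `Y=` values pulled out of the geoportail.lu links appear to be Web Mercator (EPSG:3857) metres, and the new `extract_coordinates` turns them into WGS84 degrees with the spherical-Mercator inverse formulas. A minimal standalone sketch of that conversion; the function name and the sample coordinate pair are made up for illustration, not part of app.py:

import numpy as np

R = 6378137  # radius of the Web Mercator sphere, in metres

def mercator_to_wgs84(x, y):
    # EPSG:3857 metres -> (longitude, latitude) in degrees
    lon = (x / R) * (180 / np.pi)
    lat = (180 / np.pi) * (2 * np.arctan(np.exp(y / R)) - np.pi / 2)
    return lon, lat

# Illustrative values of the kind found in '...&X=...&Y=...' map links
print(mercator_to_wgs84(682000, 6379000))  # roughly (6.1, 49.6), central Luxembourg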