mirix committed
Commit 698e383 · verified · 1 Parent(s): 5e30de5

Upload app.py

Files changed (1): app.py (+65 -36)
app.py CHANGED
@@ -1,55 +1,76 @@
import os
+ import io
import re
import time
import requests
+ import pdfplumber
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
- import subprocess
- import sys
- import gradio as gr
import folium
from folium.plugins import FloatImage
+ import gradio as gr

- # Install Playwright and dependencies if running in Hugging Face Space
- if "SPACE_ID" in os.environ:
-     print("Running in Hugging Face Space - installing Playwright...")
-     subprocess.run([sys.executable, "-m", "playwright", "install", "chromium"], check=True)
-
- # Import Playwright after potential installation
- from playwright.sync_api import sync_playwright

# Configuration
file_name = 'bathing_sites.csv'
url = 'https://eau.gouvernement.lu/fr/domaines-activite/eauxbaignade/sites-de-baignade.html'

- # Data processing functions
- def get_final_url(url):
-     with sync_playwright() as p:
-         browser = p.chromium.launch(headless=True)
-         page = browser.new_page()
-         page.set_extra_http_headers({"max-redirects": "9"})
-         if ('&X=' not in url) or ('&Y=' not in url):
-             page.goto(url, timeout=5000)
-             page.wait_for_timeout(2000)
-             url = page.url
-         browser.close()
-     return url
-
def extract_coordinates(url):
    x_match = re.search(r'X=(\d+)', url)
    y_match = re.search(r'Y=(\d+)', url)

-     x = x_match.group(1) if x_match else None
-     y = y_match.group(1) if y_match else None
+     x = int(x_match.group(1)) if x_match else 0
+     y = int(y_match.group(1)) if y_match else 0
+
+     R = 6378137  # Earth's radius in meters
+     if x != 0:
+         x = (x / R) * (180 / np.pi)
+     if y != 0:
+         y = (180 / np.pi) * (2 * np.arctan(np.exp(y / R)) - np.pi / 2)

    return pd.Series([x, y])

- def web_mercator_to_wgs84(x, y):
-     R = 6378137  # Earth's radius in meters
-     lon = (x / R) * (180 / np.pi)
-     lat = (180 / np.pi) * (2 * np.arctan(np.exp(y / R)) - np.pi / 2)
-     return lat, lon
+ def get_coordinates(pdf_list):
+     sites_list = []
+     for lake in pdf_list:
+         url_pdf = 'https:' + lake
+         response_pdf = requests.get(url_pdf)
+         bytes_io = io.BytesIO(response_pdf.content)
+
+         with pdfplumber.open(bytes_io) as pdf:
+             page = pdf.pages[0]
+             text = page.extract_text()
+             site = text.split('\n')[1].split(' ')[-1].split('’')[-1].replace('-', ' ').title().replace('Sure', 'Sûre').strip()
+             for page in pdf.pages:
+                 tables = page.extract_table()
+                 if tables and ('baignade' in tables[0][0]):
+                     headers = tables[0]
+                     headers = headers[:3]
+                     headers.append('Sector')
+                     headers.append('Lake')
+                     i = 1
+                     for table in tables[1:]:
+                         table = table[:3]
+                         if (site == 'Weiswampach') or (site == 'Remerschen'):
+                             table.append('Zone ' + str(i))
+                         elif site == 'Echternach':
+                             table.append('Designated Zone')
+                         else:
+                             table.append(table[0].split(' ')[1].strip())
+                         table.append(site)
+                         sites_list.append(table)
+                         i += 1
+
+     df = pd.DataFrame(sites_list, columns=headers)
+     df = df.dropna()
+     df = df.iloc[:, 1:]
+     df = df.iloc[:, ::-1]
+     df.columns = ['Lake', 'Sector', 'Y', 'X']
+     df[['Y', 'X']] = df[['Y', 'X']].apply(pd.to_numeric, errors='coerce')
+     df = df.drop_duplicates(subset=['Lake', 'Sector'], keep='last').reset_index(drop=True)
+     return df
+

def file_download():
    df = pd.read_html(url)[0]
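
In this hunk the Playwright-based get_final_url resolver is dropped, and extract_coordinates absorbs the inverse spherical Mercator conversion formerly in web_mercator_to_wgs84: for EPSG:3857 metres (x, y), lon = (x / R) * (180 / pi) and lat = (180 / pi) * (2 * atan(exp(y / R)) - pi / 2), with R = 6378137 m. A standalone sanity check of that formula, using a made-up point roughly over Luxembourg (the input values below are illustrative, not taken from the site data):

import numpy as np

R = 6378137  # spherical Mercator Earth radius in metres

def mercator_to_wgs84(x, y):
    # Inverse spherical Mercator: EPSG:3857 metres -> WGS84 degrees
    lon = (x / R) * (180 / np.pi)
    lat = (180 / np.pi) * (2 * np.arctan(np.exp(y / R)) - np.pi / 2)
    return lat, lon

lat, lon = mercator_to_wgs84(683_000, 6_400_000)
print(round(lat, 2), round(lon, 2))  # ~49.68 ~6.14, plausibly inside Luxembourg
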
@@ -58,9 +79,9 @@ def file_download():
    soup = BeautifulSoup(response.text, 'html.parser')

    df['images'] = [tag.find("img")["src"] for tag in soup.select("td:has(img)")]
-     df['geoport'] = [tag.find("a")["href"] for tag in soup.select("td:has(a)") if 'geoportail' in tag.find("a")["href"]]
-
-     df['geoport'] = df['geoport'].apply(get_final_url)
+     df['URL coordinates'] = [tag.find("a")["href"] for tag in soup.select("td:has(a)") if 'geoportail' in tag.find("a")["href"]]
+     pdf_list = [tag.find("a")["href"] for tag in soup.select("td:has(a)") if 'pdf' in tag.find("a")["href"]]
+     df_coord = get_coordinates(pdf_list)

    df.columns = ['Lake', 'Sector', 'Water Quality', 'Swimming allowed', 'Reason for ban', 'Traffic lights', 'URL coordinates']

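Both link columns above come from the same td:has(a) CSS selector, which relies on the soupsieve backend bundled with bs4 4.7+. The scraped hrefs are protocol-relative, which is why get_coordinates prepends 'https:'. A toy illustration of the pattern on stand-in HTML, not the live page:

from bs4 import BeautifulSoup

html = '<table><tr><td><a href="//example.lu/profile.pdf">PDF</a></td></tr></table>'
soup = BeautifulSoup(html, 'html.parser')
pdfs = [td.find("a")["href"] for td in soup.select("td:has(a)") if 'pdf' in td.find("a")["href"]]
print(pdfs)  # ['//example.lu/profile.pdf']
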
@@ -76,10 +97,18 @@ def file_download():
    df.loc[df['Traffic lights'].str.contains('greng'), 'Swimming allowed'] = 'Yes'
    df.loc[df['Traffic lights'].str.contains('roud'), 'Swimming allowed'] = 'No'
    df = df.fillna('N/A')

-     df[['X', 'Y']] = df['URL coordinates'].apply(extract_coordinates)
-     df[['X', 'Y']] = df[['X', 'Y']].apply(pd.to_numeric, errors='coerce')
-     df[['lat', 'long']] = df.apply(lambda row: web_mercator_to_wgs84(row['X'], row['Y']), axis=1, result_type='expand')
+     df[['long', 'lat']] = df['URL coordinates'].apply(extract_coordinates)
+     df[['long', 'lat']] = df[['long', 'lat']].apply(pd.to_numeric, errors='coerce')
+
+     df = df.reset_index(drop=True)
+     df = pd.merge(left=df, right=df_coord, how='left', on=['Lake', 'Sector'])
+
+     df.loc[df['long'] == 0, 'long'] = np.nan
+     df.loc[df['lat'] == 0, 'lat'] = np.nan
+     df['long'] = df['long'].fillna(df['X'])
+     df['lat'] = df['lat'].fillna(df['Y'])

    df.drop(columns=['Traffic lights', 'URL coordinates', 'X', 'Y'], inplace=True)

    df.to_csv(file_name, index=False)
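
The new fallback logic prefers coordinates parsed out of the geoportail URL, treats zeros (sites whose URL carried no X=/Y= parameters) as missing, and back-fills them from the PDF-derived df_coord after the left merge on Lake and Sector. The same pattern in isolation, on toy data with invented values:

import numpy as np
import pandas as pd

df = pd.DataFrame({'Lake': ['A', 'B'], 'Sector': ['s1', 's2'],
                   'long': [6.14, 0.0], 'lat': [49.68, 0.0]})
df_coord = pd.DataFrame({'Lake': ['B'], 'Sector': ['s2'], 'X': [6.36], 'Y': [49.51]})

df = pd.merge(df, df_coord, how='left', on=['Lake', 'Sector'])
df.loc[df['long'] == 0, 'long'] = np.nan   # zero means "not found in the URL"
df.loc[df['lat'] == 0, 'lat'] = np.nan
df['long'] = df['long'].fillna(df['X'])    # fall back to PDF-derived coordinates
df['lat'] = df['lat'].fillna(df['Y'])
print(df[['Lake', 'long', 'lat']])         # lake B picks up 6.36 / 49.51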
 
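For context on the new get_coordinates helper: pdfplumber opens in-memory PDFs directly, and extract_table() returns the page's detected table as a list of rows, or None. A minimal sketch of the flow the helper builds on, with a placeholder URL (the real list comes from the pdf_list scraped above):

import io
import requests
import pdfplumber

response = requests.get('https://example.lu/bathing-site-profile.pdf')  # hypothetical URL
with pdfplumber.open(io.BytesIO(response.content)) as pdf:
    print(pdf.pages[0].extract_text().split('\n')[1])  # the line the site name is parsed from
    for page in pdf.pages:
        table = page.extract_table()  # None when no table is detected on the page
        if table:
            headers, *rows = table
            print(headers, len(rows))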