major-tom-explorer / utils.py
MarcSkovMadsen's picture
speed up by removing dask
864777a
raw
history blame
5.71 kB
from io import BytesIO
from pathlib import Path
import holoviews as hv
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
from fsspec.parquet import open_parquet_file
from holoviews import opts
from PIL import Image
# --- Branding assets and external links --------------------------------------
MAJOR_TOM_LOGO = "assets/major-tom-esa-logo.png"
MAJOR_TOM_PICTURE = "https://upload.wikimedia.org/wikipedia/en/6/6d/Major_tom_space_oddity_video.JPG"
MAJOR_TOM_REF_URL = "https://huggingface.co/Major-TOM"
MAJOR_TOM_ARXIV_URL = "https://www.arxiv.org/abs/2402.12095"

PANEL_LOGO = "https://panel.holoviz.org/_static/logo_horizontal_light_theme.png"
PANEL_URL = "https://panel.holoviz.org"

DATASHADER_LOGO = "https://datashader.org/_static/logo_horizontal.svg"
DATASHADER_URL = "https://datashader.org/"

# --- Dataset location --------------------------------------------------------
# Default repository and the dataset names that can be combined with it to
# build a metadata URL (see ``_meta_data_url``).
REPOSITORY = "Major-TOM"
DATASETS = ["Core-S2L2A", "Core-S2L1C"]

# A fixed easting/northing coordinate pair.
# NOTE(review): presumably an ESA site in the same projected coordinates as
# the ``centre_easting``/``centre_northing`` columns - confirm with callers.
ESA_EASTING = 250668.73322714816
ESA_NORTHING = 6259216.653115547

# Human-readable label -> column identifier. Insertion order is preserved.
META_DATA_COLUMNS = {
    "Coastal aerosol": "B01",
    "Blue": "B02",
    "Green": "B03",
    "Red": "B04",
    "Vegetation Blue": "B05",
    "Vegetation Green": "B06",
    "Vegetation Red": "B07",
    "NIR": "B08",
    "Narrow NIR": "B8A",
    "Water vapour": "B09",
    "SWIR, 1613.7": "B11",
    "SWIR, 2202.4": "B12",
    "Cloud Mask": "cloud_mask",
    "Thumbnail": "thumbnail",
}

# Directory named "data" located next to this module; used as the local cache
# location for downloaded metadata files.
DATA_PATH = Path(__file__).parent.joinpath("data")
# Markdown text describing the app. The f-string interpolates
# MAJOR_TOM_REF_URL and MAJOR_TOM_ARXIV_URL, and the leading backslash keeps
# the text from starting with a newline. The text ends with an empty
# "Powered by" heading.
# NOTE(review): presumably the caller renders content (e.g. logos) after that
# heading - confirm where this constant is used.
DESCRIPTION = f"""\
## Dataset Explorer
This app provides a way of exploring samples present in the [MajorTOM-Core]({MAJOR_TOM_REF_URL}) dataset. It contains nearly every piece of Earth captured by ESA [Sentinel-2](https://sentinels.copernicus.eu/web/sentinel/missions/sentinel-2) satellite.
[Website]({MAJOR_TOM_REF_URL}), [arXiv Paper]({MAJOR_TOM_ARXIV_URL})
## Instructions
To find a sample, navigate on the map to a place of interest. Click the map to find a dataset sample at the location you clicked.
## Powered by
"""
# Song lyrics as a plain multi-line string. Every occurrence of "Major Tom" is
# wrapped in ``**`` markers (Markdown bold). The string starts and ends with a
# newline.
MAJOR_TOM_LYRICS = """
Standing there alone, the ship is waiting
All systems are go, are you sure?
Control is not convinced, but the computer
Has the evidence, no need to abort
The countdown starts
Watching in a trance, the crew is certain
Nothing left to chance, all is working
Trying to relax up in the capsule
"Send me up a drink, " jokes **Major Tom**
The count goes on
Four, three, two, one
Earth below us, drifting, falling
Floating weightless, calling, calling home
Second stage is cut, we're now in orbit
Stabilizers up, running perfect
Starting to collect requested data
"What will it affect when all is done?"
Thinks **Major Tom**
Back at ground control, there is a problem
Go to rockets full, not responding
"Hello **Major Tom**, are you receiving?
Turn the thrusters on, we're standing by"
There's no reply
Four, three, two, one
Earth below us, drifting, falling
Floating weightless, calling, calling home
Across the stratosphere a final message
"Give my wife my love, " then nothing more
Far beneath the ship, the world is mourning
They don't realize he's alive
No one understands, but **Major Tom** sees
"Now the light commands, this is my home
I'm coming home"
Earth below us, drifting, falling
Floating weightless, coming home
Earth below us, drifting, falling
Floating weightless, coming home
Earth below us, drifting, falling
Floating weightless, coming, coming home
Home
Home
Home
Home
Home
"""
# Load the Bokeh plotting backend for HoloViews.
hv.extension("bokeh")

# Register module-wide default plot options: horizontal and vertical marker
# lines are drawn gray with a line width of 1.
opts.defaults(
    # Currently disabled:
    # opts.Curve(xaxis=None, yaxis=None, show_grid=False, show_frame=False,
    #            color='orangered', framewise=True, width=100),
    opts.HLine(line_width=1, color="gray"),
    # Currently disabled:
    # opts.Layout(shared_axes=False),
    opts.VLine(line_width=1, color="gray"),
)
def _meta_data_url(dataset="Core-S2L2A", repository=REPOSITORY):
    """Build the Hugging Face URL of the ``metadata.parquet`` file of a dataset.

    Args:
        dataset: Dataset name inside the repository.
        repository: Hugging Face repository (organisation) name.

    Returns:
        The URL as a string.
    """
    dataset_root = f"https://huggingface.co/datasets/{repository}/{dataset}"
    return f"{dataset_root}/resolve/main/metadata.parquet"
def _meta_data_path(dataset="Core-S2L2A", repository=REPOSITORY):
    """Return the local cache path of a dataset's metadata parquet file.

    As a side effect, ``DATA_PATH`` is created if it does not exist yet.

    Args:
        dataset: Dataset name; becomes part of the file name.
        repository: Repository the dataset belongs to. For the default
            ``REPOSITORY`` the file name is ``"{dataset}_metadata.parquet"``.
            Any other repository gets a repository-qualified file name so
            that datasets with the same name from different repositories do
            not share (and overwrite) one cache file.

    Returns:
        A ``pathlib.Path`` inside ``DATA_PATH``.
    """
    DATA_PATH.mkdir(parents=True, exist_ok=True)
    if repository == REPOSITORY:
        # Keep the historical name so existing caches remain valid.
        file_name = f"{dataset}_metadata.parquet"
    else:
        # A "/" in the repository name would otherwise point into a
        # sub-directory that is never created.
        safe_repository = str(repository).replace("/", "_")
        file_name = f"{safe_repository}_{dataset}_metadata.parquet"
    return DATA_PATH / file_name
def get_meta_data(dataset="Core-S2L2A", repository=REPOSITORY):
    """Load the metadata table of a dataset, downloading it on first use.

    If no cached copy exists at ``_meta_data_path(...)``, the table is read
    from ``_meta_data_url(...)`` and written to the cache; otherwise the
    cached parquet file is read. A progress message is printed to stdout.

    Args:
        dataset: Dataset name.
        repository: Repository hosting the dataset. It is forwarded to both
            the URL and the cache-path helper, so a non-default repository is
            honoured instead of silently falling back to the default.

    Returns:
        A ``pandas.DataFrame`` with two added columns, ``centre_easting`` and
        ``centre_northing`` (derived from ``centre_lon``/``centre_lat``),
        ``timestamp`` converted with ``pd.to_datetime`` and the columns
        ``cloud_cover``, ``nodata``, ``centre_lat`` and ``centre_lon`` cast to
        ``float32``.
    """
    print(f"Loading {dataset}")
    path = _meta_data_path(dataset=dataset, repository=repository)
    if path.exists():
        data = pd.read_parquet(path)
    else:
        # First use: download once, cache it, and keep using the in-memory
        # frame instead of reading the file straight back from disk.
        data = pd.read_parquet(_meta_data_url(dataset=dataset, repository=repository))
        data.to_parquet(path)

    # Project while lon/lat still have their original precision; they are
    # downcast to float32 further below.
    data["centre_easting"], data["centre_northing"] = (
        hv.util.transform.lon_lat_to_easting_northing(
            data["centre_lon"], data["centre_lat"]
        )
    )

    # Optimize performance: proper datetimes and smaller float columns.
    data["timestamp"] = pd.to_datetime(data["timestamp"])
    numeric_cols = ["cloud_cover", "nodata", "centre_lat", "centre_lon"]
    data[numeric_cols] = data[numeric_cols].astype("float32")
    return data
def get_image(row, column="thumbnail"):
    """Fetch one image for a metadata row from its remote parquet file.

    Args:
        row: Mapping-like object providing ``"parquet_url"`` (location of the
            parquet file) and ``"parquet_row"`` (index of the row group to
            read).
        column: Name of the parquet column holding the encoded image bytes.

    Returns:
        The ``PIL.Image`` opened from the first value of ``column`` in the
        selected row group.
    """
    url = row["parquet_url"]
    row_group_index = row["parquet_row"]
    print(url, row_group_index, column)

    wanted = [column]
    with open_parquet_file(url, columns=wanted) as remote_file, pq.ParquetFile(
        remote_file
    ) as parquet_file:
        row_group = parquet_file.read_row_group(row_group_index, columns=wanted)

    # The cell holds the encoded image bytes; wrap them in a file-like object
    # so PIL can open them.
    encoded = row_group[column][0].as_py()
    return Image.open(BytesIO(encoded))
def euclidean_distance(x, y, target_x, target_y):
    """Return the straight-line distance from ``(x, y)`` to the target point.

    Works element-wise: ``x`` and ``y`` may be scalars or array-likes that
    support arithmetic (for example NumPy arrays or pandas Series).
    """
    delta_x = x - target_x
    delta_y = y - target_y
    squared_distance = delta_x**2 + delta_y**2
    return np.sqrt(squared_distance)
def get_closest_row(data, target_easting, target_northing):
    """Return the row of ``data`` whose centre is nearest to the target point.

    Args:
        data: DataFrame with ``centre_easting`` and ``centre_northing``
            columns.
        target_easting: Easting of the point of interest.
        target_northing: Northing of the point of interest.

    Returns:
        The result of ``data.loc[...]`` for the index label with the smallest
        distance, as reported by ``idxmin``.
    """
    distances = euclidean_distance(
        data["centre_easting"],
        data["centre_northing"],
        target_easting,
        target_northing,
    )
    nearest_label = distances.idxmin()
    return data.loc[nearest_label]
def get_closest_rows(data, target_easting, target_northing):
    """Return every row of ``data`` that is at the minimum distance to the target.

    Unlike ``get_closest_row``, the result is always a DataFrame and contains
    all rows that tie for the smallest distance.

    Args:
        data: DataFrame with ``centre_easting`` and ``centre_northing``
            columns.
        target_easting: Easting of the point of interest.
        target_northing: Northing of the point of interest.

    Returns:
        The subset of ``data`` whose distance equals the minimum distance.
    """
    distances = euclidean_distance(
        data["centre_easting"],
        data["centre_northing"],
        target_easting,
        target_northing,
    )
    is_nearest = distances == distances.min()
    return data[is_nearest]