Plonk / scripts /preprocessing /enrich-metadata-quadtree.py
nicolas-dufour's picture
squash: merge all unpushed commits
c4c7cee
import hydra
import numpy as np
import pandas as pd
from os.path import join, dirname
import matplotlib.pyplot as plt
import torch
class QuadTree(object):
def __init__(self, data, mins=None, maxs=None, id="", depth=3, do_split=1000):
self.id = id
self.data = data
if mins is None:
mins = data[["latitude", "longitude"]].to_numpy().min(0)
if maxs is None:
maxs = data[["latitude", "longitude"]].to_numpy().max(0)
self.mins = np.asarray(mins)
self.maxs = np.asarray(maxs)
self.sizes = self.maxs - self.mins
self.children = []
mids = 0.5 * (self.mins + self.maxs)
xmin, ymin = self.mins
xmax, ymax = self.maxs
xmid, ymid = mids
if (depth > 0) and (len(self.data) >= do_split):
# split the data into four quadrants
data_q1 = data[(data["latitude"] < mids[0]) & (data["longitude"] < mids[1])]
data_q2 = data[
(data["latitude"] < mids[0]) & (data["longitude"] >= mids[1])
]
data_q3 = data[
(data["latitude"] >= mids[0]) & (data["longitude"] < mids[1])
]
data_q4 = data[
(data["latitude"] >= mids[0]) & (data["longitude"] >= mids[1])
]
# recursively build a quad tree on each quadrant which has data
if data_q1.shape[0] > 0:
self.children.append(
QuadTree(
data_q1,
[xmin, ymin],
[xmid, ymid],
id + "0",
depth - 1,
do_split=do_split,
)
)
if data_q2.shape[0] > 0:
self.children.append(
QuadTree(
data_q2,
[xmin, ymid],
[xmid, ymax],
id + "1",
depth - 1,
do_split=do_split,
)
)
if data_q3.shape[0] > 0:
self.children.append(
QuadTree(
data_q3,
[xmid, ymin],
[xmax, ymid],
id + "2",
depth - 1,
do_split=do_split,
)
)
if data_q4.shape[0] > 0:
self.children.append(
QuadTree(
data_q4,
[xmid, ymid],
[xmax, ymax],
id + "3",
depth - 1,
do_split=do_split,
)
)
def unwrap(self):
if len(self.children) == 0:
return {self.id: [self.mins, self.maxs, self.data.copy()]}
else:
d = dict()
for child in self.children:
d.update(child.unwrap())
return d
def extract(qt, name_new_column):
cluster = qt.unwrap()
boundaries, data = {}, []
id_to_quad = np.array(list(cluster.keys()))
for i, (id, vs) in zip(np.arange(len(cluster)), cluster.items()):
(min_lat, min_lon), (max_lat, max_lon), points = vs
points[name_new_column] = int(i)
data.append(points)
boundaries[i] = (
float(min_lat),
float(min_lon),
float(max_lat),
float(max_lon),
points["latitude"].mean(),
points["longitude"].mean(),
)
data = pd.concat(data)
return boundaries, data, id_to_quad
def vizu(name_new_column, df_train, boundaries):
plt.hist(df_train[name_new_column], bins=len(boundaries))
plt.xlabel("Cluster ID")
plt.ylabel("Number of images")
plt.title("Cluster distribution")
plt.yscale("log")
plt.savefig(f"{name_new_column}_distrib.png")
plt.clf()
plt.scatter(
df_train["longitude"].to_numpy(),
df_train["latitude"].to_numpy(),
c=np.random.permutation(len(boundaries))[df_train[name_new_column].to_numpy()],
cmap="tab20",
s=0.1,
alpha=0.5,
)
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.title("Quadtree map")
plt.savefig(f"{name_new_column}_map.png")
@hydra.main(
config_path="../configs/scripts",
config_name="enrich-metadata-quadtree",
version_base=None,
)
def main(cfg):
data_path = join(cfg.data_dir, "osv5m")
name_new_column = f"quadtree_{cfg.depth}_{cfg.do_split}"
# Create clusters from train images
train_fp = join(data_path, f"train.csv")
df_train = pd.read_csv(train_fp)
qt = QuadTree(df_train, depth=cfg.depth, do_split=cfg.do_split)
boundaries, df_train, id_to_quad = extract(qt, name_new_column)
vizu(name_new_column, df_train, boundaries)
# Save clusters
boundaries = pd.DataFrame.from_dict(
boundaries,
orient="index",
columns=["min_lat", "min_lon", "max_lat", "max_lon", "mean_lat", "mean_lon"],
)
boundaries.to_csv(f"{name_new_column}.csv", index_label="cluster_id")
# Assign test images to clusters
test_fp = join(data_path, f"test.csv")
df_test = pd.read_csv(test_fp)
above_lat = np.expand_dims(df_test["latitude"].to_numpy(), -1) > np.expand_dims(
boundaries["min_lat"].to_numpy(), 0
)
below_lat = np.expand_dims(df_test["latitude"].to_numpy(), -1) < np.expand_dims(
boundaries["max_lat"].to_numpy(), 0
)
above_lon = np.expand_dims(df_test["longitude"].to_numpy(), -1) > np.expand_dims(
boundaries["min_lon"].to_numpy(), 0
)
below_lon = np.expand_dims(df_test["longitude"].to_numpy(), -1) < np.expand_dims(
boundaries["max_lon"].to_numpy(), 0
)
mask = np.logical_and(
np.logical_and(above_lat, below_lat), np.logical_and(above_lon, below_lon)
)
df_test[name_new_column] = np.argmax(mask, axis=1)
# save index_to_gps_quadtree file
lat = torch.tensor(boundaries["mean_lat"])
lon = torch.tensor(boundaries["mean_lon"])
coord = torch.stack([lat / 90, lon / 180], dim=-1)
torch.save(
coord, join(data_path, f"index_to_gps_quadtree_{cfg.depth}_{cfg.do_split}.pt")
)
torch.save(id_to_quad, join(data_path, f"id_to_quad_{cfg.depth}_{cfg.do_split}.pt"))
# Overwrite test.csv and train.csv
if cfg.overwrite_csv:
df_train.to_csv(train_fp, index=False)
df_test.to_csv(test_fp, index=False)
if __name__ == "__main__":
main()