|
import numpy as np |
|
import pandas as pd |
|
import torch |
|
|
|
from sklearn.datasets import make_moons |
|
from torch.utils.data import TensorDataset |
|
|
|
|
|
def moons_dataset(n=8000):
    """Return a rescaled two-moons point cloud wrapped in a ``TensorDataset``.

    Args:
        n: Number of samples requested from ``make_moons``.

    Returns:
        A ``TensorDataset`` holding a single float32 tensor with one 2-D
        point per row.
    """
    # Fixed seed and noise level keep the sample reproducible; labels are unused.
    points, _labels = make_moons(n_samples=n, random_state=42, noise=0.03)
    # Per-axis affine map: x -> (x + 0.3) * 2 - 1 and y -> (y + 0.3) * 3 - 1.
    stretch = np.array([2.0, 3.0])
    points = (points + 0.3) * stretch - 1
    return TensorDataset(torch.from_numpy(points.astype(np.float32)))
|
|
|
|
|
def line_dataset(n=8000):
    """Return points drawn uniformly from an axis-aligned rectangle.

    The first coordinate is uniform on [-0.5, 0.5) and the second on
    [-1, 1); both are then multiplied by 4.

    Args:
        n: Number of points to draw.

    Returns:
        A ``TensorDataset`` holding a single float32 tensor of shape (n, 2).
    """
    gen = np.random.default_rng(42)
    # Draw order (first coordinate, then second) determines the sample
    # values for the fixed seed, so it must not be swapped.
    first = gen.uniform(-0.5, 0.5, n)
    second = gen.uniform(-1, 1, n)
    points = 4 * np.column_stack((first, second))
    return TensorDataset(torch.from_numpy(points.astype(np.float32)))
|
|
|
|
|
def circle_dataset(n=8000):
    """Return jittered points placed on a circle of radius 3.

    Coarse grid points are projected onto the unit circle, perturbed by a
    small random offset, and scaled by 3. Grid points that round to
    (0, 0) are not moved by the projection and stay near the origin.

    Args:
        n: Number of points to generate.

    Returns:
        A ``TensorDataset`` holding a single float32 tensor of shape (n, 2).
    """
    gen = np.random.default_rng(42)
    # The four draws below must stay in this order to reproduce the same
    # sample for the fixed seed.
    grid_x = np.round(gen.uniform(-0.5, 0.5, n) / 2, 1) * 2
    grid_y = np.round(gen.uniform(-0.5, 0.5, n) / 2, 1) * 2
    # The epsilon avoids division by zero for grid points at the origin.
    length = np.sqrt(grid_x ** 2 + grid_y ** 2) + 1e-10
    angle = 2 * np.pi * gen.uniform(0, 1, n)
    offset = gen.uniform(0, 0.03, n)
    # Project onto the unit circle, then add the polar jitter.
    px = grid_x / length + offset * np.cos(angle)
    py = grid_y / length + offset * np.sin(angle)
    points = np.stack((px, py), axis=1) * 3
    return TensorDataset(torch.from_numpy(points.astype(np.float32)))
|
|
|
|
|
def dino_dataset(n=8000, path="DatasaurusDozen.tsv"):
    """Return a noisy resampling of the "dino" subset of a Datasaurus TSV.

    Rows whose ``dataset`` column equals ``"dino"`` are sampled with
    replacement, perturbed with Gaussian noise (standard deviation 0.15),
    and rescaled with ``(x / 54 - 1) * 4`` and ``(y / 48 - 1) * 4``.

    Args:
        n: Number of points to sample.
        path: Location of the tab-separated file. It must provide the
            columns ``dataset``, ``x`` and ``y``. Defaults to
            ``"DatasaurusDozen.tsv"`` relative to the working directory.

    Returns:
        A ``TensorDataset`` holding a single float32 tensor of shape (n, 2).

    Raises:
        ValueError: If the file contains no rows for the "dino" dataset.
    """
    table = pd.read_csv(path, sep="\t")
    dino = table[table["dataset"] == "dino"]
    if dino.empty:
        # Fail with a clear message instead of an opaque bounds error
        # from the random index draw below.
        raise ValueError(f"No rows with dataset == 'dino' found in {path!r}")

    rng = np.random.default_rng(42)
    # Draw order (indices, x noise, y noise) fixes the sample for the seed.
    ix = rng.integers(0, len(dino), n)
    x = dino["x"].to_numpy()[ix] + rng.normal(size=n) * 0.15
    y = dino["y"].to_numpy()[ix] + rng.normal(size=n) * 0.15
    x = (x / 54 - 1) * 4
    y = (y / 48 - 1) * 4
    X = np.stack((x, y), axis=1)
    return TensorDataset(torch.from_numpy(X.astype(np.float32)))
|
|
|
|
|
def get_dataset(name, n=10000):
    """Build the toy dataset selected by ``name``.

    Args:
        name: One of ``"moons"``, ``"dino"``, ``"line"`` or ``"circle"``.
        n: Number of points passed to the selected builder. The default
            here is 10000, which overrides the builders' own default.

    Returns:
        Whatever the selected builder returns.

    Raises:
        ValueError: If ``name`` matches none of the known datasets.
    """
    # Builders are wrapped in lambdas so only the selected one is invoked.
    builders = (
        ("moons", lambda: moons_dataset(n)),
        ("dino", lambda: dino_dataset(n)),
        ("line", lambda: line_dataset(n)),
        ("circle", lambda: circle_dataset(n)),
    )
    for key, build in builders:
        if name == key:
            return build()
    raise ValueError(f"Unknown dataset: {name}")
|
|