File size: 1,935 Bytes
f71c233 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
import numpy as np
import pandas as pd
import torch
from sklearn.datasets import make_moons
from torch.utils.data import TensorDataset
def moons_dataset(n=8000):
    """Build a dataset of ``n`` points drawn with scikit-learn's ``make_moons``.

    The generator is called with a fixed ``random_state`` (42) and
    ``noise=0.03``; its label output is discarded.  Column 0 is then mapped
    through ``(v + 0.3) * 2 - 1`` and column 1 through ``(v + 0.3) * 3 - 1``.

    Args:
        n: Number of samples requested from ``make_moons``.

    Returns:
        A ``TensorDataset`` wrapping a single float32 tensor of the
        rescaled points.
    """
    points, _labels = make_moons(n_samples=n, random_state=42, noise=0.03)
    # Same shift for both columns, but a different stretch per column
    # (2 for column 0, 3 for column 1), applied in one broadcast expression.
    stretch = np.array([2.0, 3.0])
    rescaled = (points + 0.3) * stretch - 1
    as_float32 = rescaled.astype(np.float32)
    return TensorDataset(torch.from_numpy(as_float32))
def line_dataset(n=8000):
    """Build a dataset of ``n`` points sampled uniformly from a rectangle.

    Before scaling, the first coordinate is uniform on [-0.5, 0.5) and the
    second on [-1, 1); both are then multiplied by 4.  The generator is
    seeded with 42, so repeated calls with the same ``n`` give the same data.

    Args:
        n: Number of points to draw.

    Returns:
        A ``TensorDataset`` wrapping a single float32 tensor of shape
        ``(n, 2)``.
    """
    generator = np.random.default_rng(42)
    # Draw order matters for reproducibility: first coordinate, then second.
    first = generator.uniform(-0.5, 0.5, n)
    second = generator.uniform(-1, 1, n)
    points = np.column_stack((first, second)) * 4
    return TensorDataset(torch.from_numpy(points.astype(np.float32)))
def circle_dataset(n=8000):
    """Build a dataset of ``n`` points lying close to a circle of radius 3.

    Each point starts as a uniform sample that is snapped to a coarse grid,
    is normalised by its own length (projecting it towards the unit circle),
    receives a small random offset of at most 0.03 in a random direction,
    and is finally scaled by 3.  The generator is seeded with 42.

    Args:
        n: Number of points to draw.

    Returns:
        A ``TensorDataset`` wrapping a single float32 tensor of shape
        ``(n, 2)``.
    """
    generator = np.random.default_rng(42)

    # Draw order (x, y, angle, offset length) is kept fixed so the seeded
    # stream yields the same samples on every call.
    grid_x = np.round(generator.uniform(-0.5, 0.5, n) / 2, 1) * 2
    grid_y = np.round(generator.uniform(-0.5, 0.5, n) / 2, 1) * 2

    # The epsilon avoids dividing by zero for samples snapped to (0, 0);
    # those samples stay at the origin instead of landing on the circle.
    length = np.sqrt(grid_x ** 2 + grid_y ** 2) + 1e-10

    angle = 2 * np.pi * generator.uniform(0, 1, n)
    offset = generator.uniform(0, 0.03, n)

    circle_x = grid_x / length + offset * np.cos(angle)
    circle_y = grid_y / length + offset * np.sin(angle)

    points = np.stack((circle_x, circle_y), axis=1) * 3
    return TensorDataset(torch.from_numpy(points.astype(np.float32)))
def dino_dataset(n=8000):
    """Build a dataset of ``n`` noisy points resampled from the "dino" rows.

    Reads ``DatasaurusDozen.tsv`` (tab-separated, path relative to the
    current working directory), keeps the rows whose ``dataset`` column
    equals ``"dino"``, samples ``n`` of them with replacement, adds Gaussian
    noise with standard deviation 0.15 to each coordinate, and rescales with
    ``(x / 54 - 1) * 4`` and ``(y / 48 - 1) * 4``.  The generator is seeded
    with 42.

    Args:
        n: Number of points to draw.

    Returns:
        A ``TensorDataset`` wrapping a single float32 tensor of shape
        ``(n, 2)``.
    """
    table = pd.read_csv("DatasaurusDozen.tsv", sep="\t")
    dino = table[table["dataset"] == "dino"]

    generator = np.random.default_rng(42)
    picks = generator.integers(0, len(dino), n)

    # Noise for x is drawn before noise for y; keep that order so the seeded
    # stream produces the same values.
    noisy_x = dino["x"].to_numpy()[picks] + generator.normal(size=len(picks)) * 0.15
    noisy_y = dino["y"].to_numpy()[picks] + generator.normal(size=len(picks)) * 0.15

    scaled_x = (noisy_x / 54 - 1) * 4
    scaled_y = (noisy_y / 48 - 1) * 4

    points = np.stack((scaled_x, scaled_y), axis=1)
    return TensorDataset(torch.from_numpy(points.astype(np.float32)))
def get_dataset(name, n=10000):
    """Return the dataset registered under ``name``.

    Recognised names are ``"moons"``, ``"dino"``, ``"line"`` and
    ``"circle"``; each is forwarded to the matching ``*_dataset`` builder
    with ``n`` passed through as the sample count.

    Args:
        name: Identifier of the dataset to build.
        n: Number of points requested from the builder (defaults to 10000).

    Returns:
        Whatever the selected builder returns.

    Raises:
        ValueError: If ``name`` is not one of the recognised identifiers.
    """
    if name == "moons":
        return moons_dataset(n)
    if name == "dino":
        return dino_dataset(n)
    if name == "line":
        return line_dataset(n)
    if name == "circle":
        return circle_dataset(n)
    # No builder matched: fail loudly rather than returning None.
    raise ValueError(f"Unknown dataset: {name}")
|