Skip to content

Train utils

batchify(data: np.ndarray, batch_size: int, func: Callable[[np.ndarray], np.ndarray] | None = None) -> Iterator[np.ndarray]

Batchify data. If func is not None, then the emitted item is func(batch).

Parameters:

Name Type Description Default
data np.ndarray

NumPy array of items to batchify.

required
batch_size int

Batch size; must be between 1 and len(data).

required
func Callable[[np.ndarray], np.ndarray]

Optional function to apply to each emitted batch. Defaults to identity function.

None

Returns:

Type Description
Iterator[np.ndarray]

Iterator[np.ndarray]: Generator object containing batches.

Source code in opskrift/train_utils.py
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
def batchify(
    data: np.ndarray,
    batch_size: int,
    func: Callable[[np.ndarray], np.ndarray] | None = None,
) -> Iterator[np.ndarray]:
    """Batchify `data`. If `func` is not None, then the emitted item is `func(batch)`.

    Args:
        data (np.ndarray): NumPy array of items to batchify.
        batch_size (int): Batch size; must be between 1 and `len(data)`.
        func (Callable[[np.ndarray], np.ndarray], optional): Optional function to apply
            to each emitted batch. Defaults to identity function.

    Returns:
        Iterator[np.ndarray]: Generator object containing batches.
    """
    if not isinstance(batch_size, int) or not (1 <= batch_size <= len(data)):
        raise ValueError(f"Batch size must be an int in [1, {data.shape[0]}].")

    if func is None:
        func = lambda x: x

    n = len(data)
    for i in range(0, n, batch_size):
        yield func(data[i : min(i + batch_size, n)])

get_cosine_learning_rates(lr_min: float, lr_max: float, freq: float, num_points: int) -> list[float]

Decay the learning rate based on a cosine schedule of frequency freq. Returns a list of num_points learning rate values in the interval [lr_min, lr_max].

Source code in opskrift/train_utils.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
def get_cosine_learning_rates(
    lr_min: float, lr_max: float, freq: float, num_points: int
) -> list[float]:
    """Decay the learning rate based on a cosine schedule of frequency `freq`.

    Returns a list of `num_points` learning rate values in the interval
    `[lr_min, lr_max]`.

    Args:
        lr_min (float): Lower bound of the learning rate range.
        lr_max (float): Upper bound of the learning rate range.
        freq (float): Number of cosine cycles spread over the whole schedule.
        num_points (int): Number of learning rate values to generate.

    Returns:
        list[float]: `num_points` learning rates in `[lr_min, lr_max]`.
    """
    lrs = []

    for i in range(num_points):
        # BUG FIX: the original reassigned `freq` itself each iteration
        # (`freq = freq * i / num_points`), which zeroed it at i=0 and froze
        # every subsequent value at lr_max. Use a local phase instead.
        phase = freq * i / num_points
        scaler = 0.5 * (1 + math.cos(2 * math.pi * phase))  # in [0, 1]
        lrs.append(lr_min + scaler * (lr_max - lr_min))

    return lrs

split_data(data: list[Any], train_f: float, test_f: float, shuffle: bool = False) -> dict[str, list[Any]]

Get train / test / valid splits from data. If shuffle is True, then use a random permutation of data. valid split size is given by (1 - train_f - test_f) * len(data).

Parameters:

Name Type Description Default
data list[Any]

Any collection of items to be split.

required
train_f float

Train size factor from the entire length (must be between 0 and 1).

required
test_f float

Test size factor from the entire length (must be between 0 and 1).

required
shuffle bool

Whether to use a random permutation of data.

False

Returns:

Type Description
dict[str, list[Any]]

dict[str, list[Any]]: Keys are {train, test, valid}, and values are corresponding splits

Source code in opskrift/train_utils.py
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
def split_data(
    data: list[Any], train_f: float, test_f: float, shuffle: bool = False
) -> dict[str, list[Any]]:
    """Get `train / test / valid` splits from `data`.

    If `shuffle` is True, then use a random permutation of `data`.
    `valid` split size is given by `(1 - train_f - test_f) * len(data)`.

    Args:
        data (list[Any]): Any collection of items to be split.
        train_f (float): Train size factor from the entire length (must be between 0 and 1).
        test_f (float): Test size factor from the entire length (must be between 0 and 1).
        shuffle (bool): Whether to use a random permutation of `data`.

    Returns:
        dict[str, list[Any]]: Keys are {train, test, valid}, and values are the
            corresponding splits.

    Raises:
        ValueError: If `train_f` or `test_f` is negative, or if they sum to more
            than 1 (which would silently truncate the test/valid splits).
    """
    if train_f < 0 or test_f < 0 or train_f + test_f > 1:
        raise ValueError(
            "train_f and test_f must be non-negative and sum to at most 1."
        )

    n = len(data)

    if shuffle:
        order = np.random.permutation(n)
        items = [data[i] for i in order]
    else:
        items = list(data)

    n_train = int(n * train_f)
    n_test = int(n * test_f)

    # Consecutive, non-overlapping slices; `valid` takes the remainder, so the
    # three splits always cover `data` exactly once.
    return {
        "train": items[:n_train],
        "test": items[n_train : n_train + n_test],
        "valid": items[n_train + n_test :],
    }