Skip to content

Trip and TripDataset Data Structures

Classes to handle datasets with baskets of products.

Trip

Class for a trip.

A trip is a sequence of purchases made at a specific time and at a specific store with given prices and a specific assortment. It can be seen as the content of a time-stamped purchase receipt with store identification.

Trip = (purchases, store, week, prices, assortment)

Source code in choice_learn/basket_models/dataset.py
class Trip:
    """Container for one shopping trip.

    A trip is the content of a time-stamped purchase receipt with store
    identification: the sequence of purchased items together with the store,
    the week, the prices and the assortment that applied to that purchase.

    Trip = (purchases, store, week, prices, assortment)
    """

    def __init__(
        self,
        purchases: np.ndarray,
        prices: np.ndarray,
        assortment: Union[int, np.ndarray],
        store: int = 0,
        week: int = 0,
    ) -> None:
        """Build a trip from its constitutive elements.

        Parameters
        ----------
        purchases: np.ndarray
            IDs of the purchased items, 0 to n_items - 1 (0-indexed)
            Shape must be (len_basket,), the last item is the checkout item 0
        prices: np.ndarray
            Prices of all the items in the dataset
            Shape must be (n_items,) with n_items the number of items in
            the TripDataset
        assortment: int or np.ndarray
            Either the assortment ID (int), ie the index of the assortment in
            the available_items array of the TripDataset, or the availability
            matrix itself (binary vector of length n_items where 1 means the
            item is available and 0 means the item is not available)
            An assortment is the list of available items of a specific store
            at a given time
        store: int
            Store ID, 0 to n_stores - 1 (0-indexed), by default 0
        week: int
            Week number, 0 to 51 (0-indexed), by default 0

        Raises
        ------
        ValueError
            If week is not one of the values 0, 1, ..., 51
        """
        valid_weeks = range(52)
        if week not in valid_weeks:
            raise ValueError("Week number must be between 0 and 51, inclusive.")

        # What was bought
        self.purchases = purchases
        # Where and when it was bought
        self.store = store
        self.week = week
        # Under which conditions it was bought
        self.prices = prices
        self.assortment = assortment

        # Number of entries in the basket (the checkout item is counted)
        self.trip_length = len(purchases)

    def __str__(self) -> str:
        """Describe the trip in one line.

        Returns
        -------
        str
            Text listing the purchases, store, week, prices and assortment
        """
        fragments = (
            f"Trip with {self.trip_length} purchases {self.purchases}",
            f" at store {self.store} in week {self.week}",
            f" with prices {self.prices} and assortment {self.assortment}",
        )
        return "".join(fragments)

    def get_items_up_to_index(self, i: int) -> np.ndarray:
        """Return the first i purchases of the trip.

        Parameters
        ----------
        i: int
            Exclusive end index of the returned prefix

        Returns
        -------
        np.ndarray
            Purchases located before index i (index i itself is excluded)
            Shape must be (i,)
        """
        prefix = slice(i)
        return self.purchases[prefix]

__init__(purchases, prices, assortment, store=0, week=0)

Initialize the trip.

Parameters:

Name Type Description Default
purchases ndarray

List of the ID of the purchased items, 0 to n_items - 1 (0-indexed) Shape must be (len_basket,), the last item is the checkout item 0

required
store int

Store ID, 0 to n_stores - 1 (0-indexed)

0
week int

Week number, 0 to 51 (0-indexed)

0
prices ndarray

Prices of all the items in the dataset Shape must be (n_items,) with n_items the number of items in the TripDataset

required
assortment Union[int, ndarray]

Assortment ID (int) corresponding to the assortment (ie its index in self.available_items) OR availability matrix (np.ndarray) of the assortment (binary vector of length n_items where 1 means the item is available and 0 means the item is not available) An assortment is the list of available items of a specific store at a given time

required
Source code in choice_learn/basket_models/dataset.py
def __init__(
    self,
    purchases: np.ndarray,
    prices: np.ndarray,
    assortment: Union[int, np.ndarray],
    store: int = 0,
    week: int = 0,
) -> None:
    """Build a trip from its constitutive elements.

    Parameters
    ----------
    purchases: np.ndarray
        IDs of the purchased items, 0 to n_items - 1 (0-indexed)
        Shape must be (len_basket,), the last item is the checkout item 0
    prices: np.ndarray
        Prices of all the items in the dataset
        Shape must be (n_items,) with n_items the number of items in
        the TripDataset
    assortment: int or np.ndarray
        Either the assortment ID (int), ie the index of the assortment in
        the available_items array of the TripDataset, or the availability
        matrix itself (binary vector of length n_items where 1 means the
        item is available and 0 means the item is not available)
        An assortment is the list of available items of a specific store
        at a given time
    store: int
        Store ID, 0 to n_stores - 1 (0-indexed), by default 0
    week: int
        Week number, 0 to 51 (0-indexed), by default 0

    Raises
    ------
    ValueError
        If week is not one of the values 0, 1, ..., 51
    """
    valid_weeks = range(52)
    if week not in valid_weeks:
        raise ValueError("Week number must be between 0 and 51, inclusive.")

    # What was bought
    self.purchases = purchases
    # Where and when it was bought
    self.store = store
    self.week = week
    # Under which conditions it was bought
    self.prices = prices
    self.assortment = assortment

    # Number of entries in the basket (the checkout item is counted)
    self.trip_length = len(purchases)

__str__()

Return short representation of the trip.

Returns:

Type Description
str

Representation of the trip

Source code in choice_learn/basket_models/dataset.py
def __str__(self) -> str:
    """Describe the trip in one line.

    Returns
    -------
    str
        Text listing the purchases, store, week, prices and assortment
    """
    fragments = (
        f"Trip with {self.trip_length} purchases {self.purchases}",
        f" at store {self.store} in week {self.week}",
        f" with prices {self.prices} and assortment {self.assortment}",
    )
    return "".join(fragments)

get_items_up_to_index(i)

Get items up to index i.

Parameters:

Name Type Description Default
i int

Exclusive end index: the purchases at positions 0 to i - 1 are returned

required

Returns:

Type Description
ndarray

List of items up to index i (excluded) Shape must be (i,)

Source code in choice_learn/basket_models/dataset.py
def get_items_up_to_index(self, i: int) -> np.ndarray:
    """Return the first i purchases of the trip.

    Parameters
    ----------
    i: int
        Exclusive end index of the returned prefix

    Returns
    -------
    np.ndarray
        Purchases located before index i (index i itself is excluded)
        Shape must be (i,)
    """
    prefix = slice(i)
    return self.purchases[prefix]

TripDataset

Class for a dataset of trips.

Source code in choice_learn/basket_models/dataset.py
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
class TripDataset:
    """Class for a dataset of trips.

    The dataset stores a list of trips together with the array of
    availability matrices that integer assortment IDs of the trips index into.
    It keeps two derived attributes up to date: max_length (length of the
    longest trip) and n_samples (total number of purchased items, checkout
    items included).
    """

    def __init__(self, trips: "list[Trip]", available_items: np.ndarray) -> None:
        """Initialize the dataset.

        Parameters
        ----------
        trips: list[Trip]
            List of trips
            Length must be n_trips
        available_items: np.ndarray
            Array of availability matrices
            available_items[i]: availability matrix of the assortment whose ID is i
            (The availability matrix is a binary vector of length n_items
            where 1 means the item is available and 0 means the item is not available)
            Shape must be (n_assortments, n_items)
        """
        self.trips = trips
        # default=0 keeps an empty dataset (eg an empty selection) constructible
        self.max_length = max((trip.trip_length for trip in self.trips), default=0)
        # One sample per purchased item; same count as len(self.get_transactions())
        # without building the whole dictionary
        self.n_samples = sum(len(trip.purchases) for trip in self.trips)
        self.available_items = available_items

    def __len__(self) -> int:
        """Return the number of trips in the dataset.

        Returns
        -------
        int
            Number of trips in the dataset
        """
        return len(self.trips)

    def __str__(self) -> str:
        """Return short representation of the dataset.

        Returns
        -------
        str
            Representation of the dataset
        """
        return f"TripDataset with {len(self)} trips"

    def __iter__(self) -> iter:
        """Iterate over the trips in the dataset.

        Returns
        -------
        iter
            Iterator over the trips
        """
        return iter(self.trips)

    @staticmethod
    def _shift_assortment_ids(trips: list, offset: int) -> list:
        """Return trips whose integer assortment IDs are shifted by offset.

        Trips are never modified: a trip that needs a new ID is shallow-copied
        first. Trips that carry their own availability matrix are returned as is.

        Parameters
        ----------
        trips: list[Trip]
            Trips to re-index
        offset: int
            Value added to every integer assortment ID

        Returns
        -------
        list[Trip]
            Trips in the same order, re-indexed where needed
        """
        import copy  # local import: only needed when datasets are concatenated

        shifted_trips = []
        for trip in trips:
            if offset and isinstance(trip.assortment, (int, np.integer)):
                trip = copy.copy(trip)
                trip.assortment = trip.assortment + offset
            shifted_trips.append(trip)
        return shifted_trips

    def concatenate(self, other: "TripDataset", inplace: bool = False) -> "TripDataset":
        """Add a dataset to another.

        The availability matrices of other are stacked below those of self, so
        the matrix with ID k in other ends up at row self.n_assortments + k.
        Integer assortment IDs of the trips coming from other are shifted
        accordingly (on copies of these trips; other is left untouched).

        Parameters
        ----------
        other: TripDataset
            Dataset to add
        inplace: bool
            Whether to add the dataset in-place or not, by default False

        Returns
        -------
        TripDataset
            Concatenated dataset (self when inplace is True, a new dataset otherwise)
        """
        merged_trips = self.trips + self._shift_assortment_ids(other.trips, self.n_assortments)
        merged_available_items = np.concatenate(
            (self.available_items, other.available_items), axis=0
        )

        if not inplace:
            return TripDataset(trips=merged_trips, available_items=merged_available_items)

        # Rebind instead of using += so that the list object handed to the
        # constructor by the caller is not silently extended
        self.trips = merged_trips
        self.max_length = max((trip.trip_length for trip in self.trips), default=0)
        self.n_samples = sum(len(trip.purchases) for trip in self.trips)
        self.available_items = merged_available_items
        return self

    def get_trip(self, index: int) -> "Trip":
        """Return the trip at the given index.

        Parameters
        ----------
        index: int
            Index of the trip to get

        Returns
        -------
        Trip
            Trip at the given index
        """
        return self.trips[index]

    def get_transactions(self) -> dict:
        """Return the transactions of the TripDataset.

        One transaction is a triplet (store, trip, item).

        Returns
        -------
        dict
            Transactions of the TripDataset
            keys: trans_id
            values: (store, trip, item)
        """
        triplets = (
            (trip.store, trip_index, item)
            for trip_index, trip in enumerate(self.trips)
            for item in trip.purchases
        )
        # Transaction IDs are the consecutive integers 0, 1, 2, ...
        return dict(enumerate(triplets))

    def get_all_items(self) -> np.ndarray:
        """Return the list of all items available in the dataset.

        Returns
        -------
        np.ndarray
            List of items available in the dataset
        """
        return np.arange(self.n_items)

    def get_all_baskets(self) -> np.ndarray:
        """Return the list of all baskets in the dataset.

        Returns
        -------
        np.ndarray
            List of baskets in the dataset
            Regular array when all the baskets have the same length, 1D array
            of dtype object holding one basket per entry otherwise
        """
        baskets = [trip.purchases for trip in self.trips]
        if len({len(basket) for basket in baskets}) <= 1:
            return np.array(baskets)

        # Baskets of different lengths cannot form a rectangular array (recent
        # NumPy versions raise on such input), so store them as objects
        ragged_baskets = np.empty(len(baskets), dtype=object)
        for position, basket in enumerate(baskets):
            ragged_baskets[position] = basket
        return ragged_baskets

    def get_all_stores(self) -> np.ndarray:
        """Return the list of all stores in the dataset.

        Returns
        -------
        np.ndarray
            List of stores in the dataset (each store appears once)
        """
        # If preprocessing working well, equal to [0, 1, ..., n_stores - 1]
        return np.array(list({trip.store for trip in self.trips}))

    def get_all_weeks(self) -> np.ndarray:
        """Return the list of all weeks in the dataset.

        Returns
        -------
        np.ndarray
            List of weeks in the dataset (each week appears once)
        """
        # Trip only accepts weeks 0 to 51, so the values are within that range
        return np.array(list({trip.week for trip in self.trips}))

    def get_all_prices(self) -> np.ndarray:
        """Return the list of all price arrays in the dataset.

        Returns
        -------
        np.ndarray
            List of price arrays in the dataset
        """
        return np.array([trip.prices for trip in self.trips])

    @property
    def n_items(self) -> int:
        """Return the number of items available in the dataset.

        Returns
        -------
        int
            Number of items available in the dataset
        """
        return self.available_items.shape[1]

    @property
    def n_stores(self) -> int:
        """Return the number of stores in the dataset.

        Returns
        -------
        int
            Number of stores in the dataset
        """
        return len(self.get_all_stores())

    @property
    def n_assortments(self) -> int:
        """Return the number of assortments in the dataset.

        Returns
        -------
        int
            Number of assortments in the dataset
        """
        return self.available_items.shape[0]

    def get_augmented_data_from_trip_index(
        self,
        trip_index: int,
    ) -> tuple[np.ndarray]:
        """Get augmented data from a trip index.

        Augmented data includes all the transactions obtained sequentially from the trip:
            - permuted items,
            - permuted, truncated and padded baskets,
            - padded future purchases based on the baskets,
            - stores,
            - weeks,
            - prices,
            - available items.

        Parameters
        ----------
        trip_index: int
            Index of the trip from which to get the data

        Returns
        -------
        tuple[np.ndarray]
            For each sample (ie transaction) from the trip:
            item, basket, future purchases, store, week, prices, available items
            Length must be 7
        """
        # Get the trip from the index
        trip = self.trips[trip_index]
        length_trip = len(trip.purchases)

        # Draw a random permutation of the items in the basket without the checkout item 0
        # Sampling all the positions without replacement gives a uniformly drawn
        # permutation without enumerating the (length_trip - 1)! candidates
        # TODO at a later stage: improve by sampling several permutations here
        n_permuted = max(length_trip - 1, 0)
        permutation = random.sample(range(n_permuted), n_permuted)

        # Permute the basket while keeping the checkout item 0 at the end
        permuted_purchases = np.array([trip.purchases[j] for j in permutation] + [0])

        # Row i of padded_truncated_purchases is the basket after i purchases (from the
        # empty basket to the basket with all the elements except the checkout item)
        # Row i of padded_future_purchases is its complement: the items that will be
        # purchased after item i
        # Both are padded with -1 up to self.max_length so that all the rows, whatever
        # the trip, have the same length
        padded_truncated_purchases = np.full((length_trip, self.max_length), -1, dtype=int)
        padded_future_purchases = np.full((length_trip, self.max_length), -1, dtype=int)
        for i in range(length_trip):
            padded_truncated_purchases[i, :i] = permuted_purchases[:i]
            padded_future_purchases[i, : length_trip - i - 1] = permuted_purchases[i + 1 :]

        if isinstance(trip.assortment, (int, np.integer)):
            # Then it is the assortment ID (ie its index in self.available_items)
            # NumPy integers are accepted as IDs as well as Python integers
            assortment = self.available_items[trip.assortment]
        else:  # np.ndarray
            # Then it is directly the availability matrix
            assortment = trip.assortment

        # Each item is linked to a basket, the future purchases,
        # a store, a week, prices and an assortment
        return (
            permuted_purchases,  # Items
            padded_truncated_purchases,  # Baskets
            padded_future_purchases,  # Future purchases
            np.full(length_trip, trip.store),  # Stores
            np.full(length_trip, trip.week),  # Weeks
            np.tile(trip.prices, (length_trip, 1)),  # Prices
            np.tile(assortment, (length_trip, 1)),  # Available items
        )

    def iter_batch(
        self,
        batch_size: int,
        shuffle: bool = False,
    ) -> object:
        """Iterate over a TripDataset to return batches of items of length batch_size.

        Every sample of the dataset is yielded exactly once: all the batches
        contain batch_size samples, except the last one which holds the
        remaining samples when their number is not a multiple of batch_size.

        Parameters
        ----------
        batch_size: int
            Batch size (number of items in the batch)
            -1 yields the whole dataset in one batch
        shuffle: bool
            Whether or not to shuffle the dataset

        Yields
        ------
        tuple[np.ndarray]
            For each item in the batch: item, basket, future purchases,
            store, week, prices, available items
            Length must be 7

        Raises
        ------
        ValueError
            If batch_size is neither -1 nor a positive integer (raised when
            the iteration starts)
        """
        whole_dataset = batch_size == -1
        if not whole_dataset and batch_size < 1:
            # Such a batch size can never be filled and would loop forever
            raise ValueError("batch_size must be a positive integer or -1 (whole dataset).")

        # Get trip indexes
        trip_indexes = np.arange(len(self))

        # Shuffle trip indexes
        # TODO: shuffling on the trip indexes or on the item indexes?
        if shuffle:
            trip_indexes = np.random.default_rng().permutation(trip_indexes)

        # Initialize the buffer
        buffer = (
            np.empty(0, dtype=int),  # Items
            np.empty((0, self.max_length), dtype=int),  # Baskets
            np.empty((0, self.max_length), dtype=int),  # Future purchases
            np.empty(0, dtype=int),  # Stores
            np.empty(0, dtype=int),  # Weeks
            np.empty((0, self.n_items), dtype=int),  # Prices
            np.empty((0, self.n_items), dtype=int),  # Available items
        )

        for trip_index in trip_indexes:
            # Fill the buffer with the augmented data of a new trip
            additional_trip_data = self.get_augmented_data_from_trip_index(trip_index)
            buffer = tuple(
                np.concatenate((stored, additional))
                for stored, additional in zip(buffer, additional_trip_data)
            )

            # Flush as many full batches as the buffer holds (a single trip can
            # fill several batches); nothing is flushed in whole-dataset mode
            while not whole_dataset and len(buffer[0]) >= batch_size:
                yield tuple(column[:batch_size] for column in buffer)
                buffer = tuple(column[batch_size:] for column in buffer)

        # Whole dataset in one batch, or last batch partially filled
        if whole_dataset or len(buffer[0]) > 0:
            yield buffer

    def __getitem__(
        self, index: Union[int, list, np.ndarray, range, slice]
    ) -> "TripDataset":
        """Return a TripDataset object populated with the trips at index.

        Parameters
        ----------
        index: int, list[int], np.ndarray, range or slice
            Index or list of indices of the trip(s) to get
            NumPy integers are accepted as single indices

        Returns
        -------
        TripDataset
            New dataset holding the selected trip(s) and sharing the
            available_items array of this dataset

        Raises
        ------
        TypeError
            If index is of none of the supported types
        """
        if isinstance(index, (int, np.integer)):
            selected_trips = [self.trips[index]]
        elif isinstance(index, (list, np.ndarray, range)):
            selected_trips = [self.trips[i] for i in index]
        elif isinstance(index, slice):
            selected_trips = self.trips[index]
        else:
            raise TypeError("Type of index must be int, list, np.ndarray, range or slice.")

        return TripDataset(trips=selected_trips, available_items=self.available_items)

n_assortments: int property

Return the number of assortments in the dataset.

Returns:

Type Description
int

Number of assortments in the dataset

n_items: int property

Return the number of items available in the dataset.

Returns:

Type Description
int

Number of items available in the dataset

n_stores: int property

Return the number of stores in the dataset.

Returns:

Type Description
int

Number of stores in the dataset

__getitem__(index)

Return a TripDataset object populated with the trips at index.

Parameters:

Name Type Description Default
index Union[int, list, ndarray, range, slice]

Index or list of indices of the trip(s) to get

required

Returns:

Type Description
TripDataset

New TripDataset containing the trip at the given index, or the trips at the given indices; it shares the available_items array of the original dataset

Source code in choice_learn/basket_models/dataset.py
def __getitem__(self, index: Union[int, list, np.ndarray, range, slice]) -> "TripDataset":
    """Return a TripDataset object populated with the trips at index.

    Parameters
    ----------
    index: int, list[int], np.ndarray, range or slice
        Index or list of indices of the trip(s) to get
        NumPy integers are accepted as single indices

    Returns
    -------
    TripDataset
        New dataset holding the selected trip(s) and sharing the
        available_items array of this dataset

    Raises
    ------
    TypeError
        If index is of none of the supported types
    """
    if isinstance(index, (int, np.integer)):
        # np.integer covers indices drawn from NumPy arrays (eg np.int64),
        # which are not instances of the built-in int
        selected_trips = [self.trips[index]]
    elif isinstance(index, (list, np.ndarray, range)):
        selected_trips = [self.trips[i] for i in index]
    elif isinstance(index, slice):
        selected_trips = self.trips[index]
    else:
        raise TypeError("Type of index must be int, list, np.ndarray, range or slice.")

    return TripDataset(trips=selected_trips, available_items=self.available_items)

__init__(trips, available_items)

Initialize the dataset.

Parameters:

Name Type Description Default
trips list[Trip]

List of trips Length must be n_trips

required
available_items ndarray

Array of availability matrices available_items[i]: availability matrix of the assortment whose ID is i (The availability matrix is a binary vector of length n_items where 1 means the item is available and 0 means the item is not available) Shape must be (n_assortments, n_items)

required
Source code in choice_learn/basket_models/dataset.py
def __init__(self, trips: "list[Trip]", available_items: np.ndarray) -> None:
    """Initialize the dataset.

    Parameters
    ----------
    trips: list[Trip]
        List of trips
        Length must be n_trips
    available_items: np.ndarray
        Array of availability matrices
        available_items[i]: availability matrix of the assortment whose ID is i
        (The availability matrix is a binary vector of length n_items
        where 1 means the item is available and 0 means the item is not available)
        Shape must be (n_assortments, n_items)
    """
    self.trips = trips
    # default=0 keeps an empty dataset (eg an empty selection) constructible
    self.max_length = max((trip.trip_length for trip in self.trips), default=0)
    # One sample per purchased item: the number of transactions, counted
    # without building the whole transaction dictionary
    self.n_samples = sum(len(trip.purchases) for trip in self.trips)
    self.available_items = available_items

__iter__()

Iterate over the trips in the dataset.

Returns:

Type Description
iter

Iterator over the trips

Source code in choice_learn/basket_models/dataset.py
def __iter__(self) -> iter:
    """Walk through the trips of the dataset, in storage order.

    Returns
    -------
    iter
        Iterator over the trips
    """
    trip_iterator = iter(self.trips)
    return trip_iterator

__len__()

Return the number of trips in the dataset.

Returns:

Type Description
int

Number of trips in the dataset

Source code in choice_learn/basket_models/dataset.py
def __len__(self) -> int:
    """Count the trips stored in the dataset.

    Returns
    -------
    int
        Number of trips in the dataset
    """
    trip_count = len(self.trips)
    return trip_count

__str__()

Return short representation of the dataset.

Returns:

Type Description
str

Representation of the dataset

Source code in choice_learn/basket_models/dataset.py
def __str__(self) -> str:
    """Summarize the dataset by its number of trips.

    Returns
    -------
    str
        Representation of the dataset
    """
    trip_count = len(self)
    return "TripDataset with " + str(trip_count) + " trips"

concatenate(other, inplace=False)

Add a dataset to another.

Parameters:

Name Type Description Default
other object

Dataset to add

required
inplace bool

Whether to add the dataset in-place or not, by default False

False

Returns:

Type Description
TripDataset

Concatenated dataset

Source code in choice_learn/basket_models/dataset.py
def concatenate(self, other: "TripDataset", inplace: bool = False) -> "TripDataset":
    """Add a dataset to another.

    The availability matrices of other are stacked below those of self, so
    the matrix with ID k in other ends up at row n_assortments_of_self + k.
    Integer assortment IDs of the trips coming from other are shifted
    accordingly (on shallow copies of these trips; other is left untouched).

    Parameters
    ----------
    other: TripDataset
        Dataset to add
    inplace: bool
        Whether to add the dataset in-place or not, by default False

    Returns
    -------
    TripDataset
        Concatenated dataset (self when inplace is True, a new dataset otherwise)
    """
    import copy  # local import: only needed to re-index the trips of other

    # Number of assortments of self = index of the first matrix of other
    # once the two arrays are stacked
    offset = self.available_items.shape[0]
    shifted_trips = []
    for trip in other.trips:
        if offset and isinstance(trip.assortment, (int, np.integer)):
            trip = copy.copy(trip)
            trip.assortment = trip.assortment + offset
        shifted_trips.append(trip)

    merged_trips = self.trips + shifted_trips
    merged_available_items = np.concatenate(
        (self.available_items, other.available_items), axis=0
    )

    if not inplace:
        # Create a new dataset by adding the 2 datasets together
        return TripDataset(trips=merged_trips, available_items=merged_available_items)

    # Rebind instead of using += so that the list object handed to the
    # constructor by the caller is not silently extended
    self.trips = merged_trips
    # Update the attributes of the TripDataset
    self.max_length = max((trip.trip_length for trip in self.trips), default=0)
    self.n_samples = sum(len(trip.purchases) for trip in self.trips)
    self.available_items = merged_available_items
    return self

get_all_baskets()

Return the list of all baskets in the dataset.

Returns:

Type Description
ndarray

List of baskets in the dataset

Source code in choice_learn/basket_models/dataset.py
def get_all_baskets(self) -> np.ndarray:
    """Return the list of all baskets in the dataset.

    Returns
    -------
    np.ndarray
        List of baskets in the dataset
        Regular array when all the baskets have the same length, 1D array
        of dtype object holding one basket per entry otherwise
    """
    baskets = [trip.purchases for trip in self.trips]
    if len({len(basket) for basket in baskets}) <= 1:
        return np.array(baskets)

    # Baskets of different lengths cannot form a rectangular array (recent
    # NumPy versions raise on such input), so store them as objects
    ragged_baskets = np.empty(len(baskets), dtype=object)
    for position, basket in enumerate(baskets):
        ragged_baskets[position] = basket
    return ragged_baskets

get_all_items()

Return the list of all items available in the dataset.

Returns:

Type Description
ndarray

List of items available in the dataset

Source code in choice_learn/basket_models/dataset.py
def get_all_items(self) -> np.ndarray:
    """List the IDs of all the items of the dataset.

    Returns
    -------
    np.ndarray
        Item IDs 0, 1, ..., n_items - 1
    """
    item_count = self.n_items
    return np.arange(0, item_count)

get_all_prices()

Return the list of all price arrays in the dataset.

Returns:

Type Description
ndarray

List of price arrays in the dataset

Source code in choice_learn/basket_models/dataset.py
def get_all_prices(self) -> np.ndarray:
    """Stack the price arrays of all the trips of the dataset.

    Returns
    -------
    np.ndarray
        One entry per trip, in dataset order: the price array of that trip
    """
    price_rows = [trip.prices for trip in self.trips]
    return np.array(price_rows)

get_all_stores()

Return the list of all stores in the dataset.

Returns:

Type Description
ndarray

List of stores in the dataset

Source code in choice_learn/basket_models/dataset.py
def get_all_stores(self) -> np.ndarray:
    """Collect the distinct store IDs found in the trips of the dataset.

    Returns
    -------
    np.ndarray
        Store IDs of the dataset, each one appearing once
    """
    # With a well-behaved preprocessing, the IDs are 0, 1, ..., n_stores - 1
    distinct_stores = set()
    for trip in self.trips:
        distinct_stores.add(trip.store)
    return np.array(list(distinct_stores))

get_all_weeks()

Return the list of all weeks in the dataset.

Returns:

Type Description
ndarray

List of weeks in the dataset

Source code in choice_learn/basket_models/dataset.py
def get_all_weeks(self) -> np.ndarray:
    """Collect the distinct week numbers found in the trips of the dataset.

    Returns
    -------
    np.ndarray
        Week numbers of the dataset, each one appearing once
    """
    # With a well-behaved preprocessing, the values are week numbers
    # between 0 and 51
    distinct_weeks = set()
    for trip in self.trips:
        distinct_weeks.add(trip.week)
    return np.array(list(distinct_weeks))

get_augmented_data_from_trip_index(trip_index)

Get augmented data from a trip index.

Augmented data includes all the transactions obtained sequentially from the trip: permuted items; permuted, truncated and padded baskets; padded future purchases based on the baskets; stores; weeks; prices; and available items.

Parameters:

Name Type Description Default
trip_index int

Index of the trip from which to get the data

required

Returns:

Type Description
tuple[ndarray]

For each sample (ie transaction) from the trip: item, basket, future purchases, store, week, prices, available items Length must be 7

Source code in choice_learn/basket_models/dataset.py
def get_augmented_data_from_trip_index(
    self,
    trip_index: int,
) -> tuple[np.ndarray, ...]:
    """Get augmented data from a trip index.

    Augmented data includes all the transactions obtained sequentially from the trip:
        - permuted items,
        - permuted, truncated and padded baskets,
        - padded future purchases based on the baskets,
        - stores,
        - weeks,
        - prices,
        - available items.

    The order of the items is drawn at random (with the `random` module) at each
    call; the last item of the permuted purchases is always the checkout item 0.

    Parameters
    ----------
    trip_index: int
        Index of the trip from which to get the data

    Returns
    -------
    tuple[np.ndarray]
        For each sample (ie transaction) from the trip:
        item, basket, future purchases, store, week, prices, available items
        Length must be 7
    """
    # Get the trip from the index
    trip = self.trips[trip_index]
    length_trip = len(trip.purchases)

    # Draw a random permutation of the items in the basket without the checkout item 0
    # Shuffling the positions is linear in the basket size, whereas enumerating every
    # permutation to sample one of them is factorial in time and memory
    # TODO at a later stage: improve by sampling several permutations here
    permutation = list(range(length_trip - 1))
    random.shuffle(permutation)

    # Permute the basket while keeping the checkout item 0 at the end
    permuted_purchases = np.array([trip.purchases[j] for j in permutation] + [0])
    n_purchases = len(permuted_purchases)

    # Truncate the baskets: for each batch sample, we consider the truncation possibilities
    # ranging from an empty basket to the basket with all the elements except the checkout item
    # And pad the truncated baskets with -1 to have the same length (because we need
    # numpy arrays for tiling and numpy arrays must have the same length)
    padded_truncated_purchases = np.array(
        [
            np.concatenate((permuted_purchases[:i], np.full(self.max_length - i, -1)))
            for i in range(length_trip)
        ],
        dtype=int,
    )

    # padded_future_purchases are the complements of padded_truncated_purchases, ie the
    # items that are not yet in the (permuted) basket but that we know will be purchased
    # during the next steps of the trip
    # Pad the future purchases with -1 to have the same length
    padded_future_purchases = np.array(
        [
            np.concatenate(
                (
                    permuted_purchases[i + 1 :],
                    np.full(self.max_length - n_purchases + i + 1, -1),
                )
            )
            for i in range(length_trip)
        ],
        dtype=int,
    )

    if isinstance(trip.assortment, (int, np.integer)):
        # Then it is the assortment ID (ie its index in self.available_items)
        # NumPy integers are accepted too, so that an ID read from an array is
        # looked up instead of being tiled as if it were an availability matrix
        assortment = self.available_items[trip.assortment]
    else:  # np.ndarray
        # Then it is directly the availability matrix
        assortment = trip.assortment

    # Each item is linked to a basket, the future purchases,
    # a store, a week, prices and an assortment
    return (
        permuted_purchases,  # Items
        padded_truncated_purchases,  # Baskets
        padded_future_purchases,  # Future purchases
        np.full(length_trip, trip.store),  # Stores
        np.full(length_trip, trip.week),  # Weeks
        np.tile(trip.prices, (length_trip, 1)),  # Prices
        np.tile(assortment, (length_trip, 1)),  # Available items
    )

get_transactions()

Return the transactions of the TripDataset.

One transaction is a triplet (store, trip, item).

Returns:

Type Description
dict

Transactions of the TripDataset. Keys: trans_id; values: (store, trip, item).

Source code in choice_learn/basket_models/dataset.py
def get_transactions(self) -> dict:
    """Return the transactions of the TripDataset.

    One transaction is a triplet (store, trip, item). Transactions are numbered
    from 0, following the order of the trips and, within a trip, the order of
    its purchases.

    Returns
    -------
    dict
        Transactions of the TripDataset
        keys: trans_id
        values: (store, trip, item)
    """
    # enumerate provides the running transaction ID over all (trip, item) pairs
    return dict(
        enumerate(
            (trip.store, trip_position, item)
            for trip_position, trip in enumerate(self.trips)
            for item in trip.purchases
        )
    )

get_trip(index)

Return the trip at the given index.

Parameters:

Name Type Description Default
index int

Index of the trip to get

required

Returns:

Type Description
Trip

Trip at the given index

Source code in choice_learn/basket_models/dataset.py
def get_trip(self, index: int) -> Trip:
    """Look up a single trip of the dataset by its position.

    Parameters
    ----------
    index: int
        Position of the requested trip in self.trips

    Returns
    -------
    Trip
        The trip stored at that position
    """
    requested_trip = self.trips[index]
    return requested_trip

iter_batch(batch_size, shuffle=False)

Iterate over a TripDataset to return batches of items of length batch_size.

Parameters:

Name Type Description Default
batch_size int

Batch size (number of items in the batch)

required
shuffle bool

Whether or not to shuffle the dataset

False

Yields:

Type Description
tuple[ndarray]

For each item in the batch: item, basket, future purchases, store, week, prices, available items. Length must be 7.

Source code in choice_learn/basket_models/dataset.py
def iter_batch(
    self,
    batch_size: int,
    shuffle: bool = False,
) -> object:
    """Iterate over a TripDataset to return batches of items of length batch_size.

    Every sample of every trip is yielded exactly once: all the batches hold
    batch_size samples, except possibly the last one which holds the remaining
    samples.

    Parameters
    ----------
    batch_size: int
        Batch size (number of items in the batch)
        If -1, the whole dataset is yielded as a single batch
    shuffle: bool
        Whether or not to shuffle the dataset
        (the shuffling is done on the order of the trips)

    Yields
    ------
    tuple[np.ndarray]
        For each item in the batch: item, basket, future purchases,
        store, week, prices, available items
        Length must be 7

    Raises
    ------
    ValueError
        If batch_size is neither -1 nor a positive integer
        (raised when the iteration starts)
    """
    if batch_size != -1 and batch_size < 1:
        # A batch size of 0 (or below -1) could never consume the buffer
        raise ValueError(f"batch_size must be -1 or a positive integer, got {batch_size}.")

    # Get trip indexes
    num_trips = len(self)
    trip_indexes = np.arange(num_trips)

    # Shuffle trip indexes
    # TODO: shuffling on the trip indexes or on the item indexes?
    if shuffle:
        trip_indexes = np.random.default_rng().permutation(trip_indexes)

    # Initialize the buffer
    buffer = (
        np.empty(0, dtype=int),  # Items
        np.empty((0, self.max_length), dtype=int),  # Baskets
        np.empty((0, self.max_length), dtype=int),  # Future purchases
        np.empty(0, dtype=int),  # Stores
        np.empty(0, dtype=int),  # Weeks
        np.empty((0, self.n_items), dtype=int),  # Prices
        np.empty((0, self.n_items), dtype=int),  # Available items
    )

    if batch_size == -1:
        # Get the whole dataset in one batch
        for trip_index in trip_indexes:
            additional_trip_data = self.get_augmented_data_from_trip_index(trip_index)
            buffer = tuple(
                np.concatenate((buffer[i], additional_trip_data[i])) for i in range(len(buffer))
            )

        # Yield the whole dataset
        yield buffer
        return

    # Yield batches of size batch_size while going through all the trips
    for trip_index in trip_indexes:
        # Fill the buffer with the augmented data of a new trip
        additional_trip_data = self.get_augmented_data_from_trip_index(trip_index)
        buffer = tuple(
            np.concatenate((buffer[i], additional_trip_data[i])) for i in range(len(buffer))
        )

        # Drain the buffer as long as it holds at least one full batch: a single trip
        # may bring several batches at once, including the last trip of the dataset
        while len(buffer[0]) >= batch_size:
            batch = tuple(buffer[i][:batch_size] for i in range(len(buffer)))
            buffer = tuple(buffer[i][batch_size:] for i in range(len(buffer)))
            yield batch

    # All the trips have been considered: yield the batch partially filled, if any
    if len(buffer[0]) > 0:
        yield buffer