Trip and TripDataset Data Structures

Classes to handle datasets with baskets of products.

`Trip`

Class for a trip.

A trip is a sequence of purchases made at a specific time and at a specific store with given prices and a specific assortment. It can be seen as the content of a time-stamped purchase receipt with store identification.

Trip = (purchases, store, week, prices, assortment)

Source code in choice_learn/basket_models/dataset.py

class Trip:
    """Single shopping trip, i.e. the content of one purchase receipt.

    A trip bundles the ordered purchases of one store visit with the context
    in which they were made: the store, the week, the prices in force and the
    assortment on offer at that moment.

    Trip = (purchases, store, week, prices, assortment)
    """

    def __init__(
        self,
        purchases: np.ndarray,
        prices: np.ndarray,
        assortment: Union[int, np.ndarray],
        store: int = 0,
        week: int = 0,
    ) -> None:
        """Build a trip from its purchases and their context.

        Parameters
        ----------
        purchases: np.ndarray
            IDs of the purchased items, each in 0 .. n_items - 1 (0-indexed)
            Shape must be (len_basket,); the last entry is the checkout item 0
        prices: np.ndarray
            Prices of all the items of the dataset
            Shape must be (n_items,), n_items being the number of items of
            the TripDataset
        assortment: int or np.ndarray
            Either the assortment ID (int), i.e. its row index in the
            `available_items` array of the TripDataset, or the availability
            vector itself (np.ndarray, binary, length n_items, 1 = available,
            0 = not available)
        store: int
            Store ID, 0 to n_stores - 1 (0-indexed), by default 0
        week: int
            Week number, 0 to 51 (0-indexed), by default 0

        Raises
        ------
        ValueError
            If `week` is not in 0 .. 51
        """
        week_is_valid = week in range(52)
        if not week_is_valid:
            raise ValueError("Week number must be between 0 and 51, inclusive.")

        # What was bought
        self.purchases = purchases
        # Where and when it was bought
        self.store = store
        self.week = week
        # Market conditions at that moment
        self.prices = prices
        self.assortment = assortment

        # Basket size, cached once at construction
        self.trip_length = len(purchases)

    def __str__(self) -> str:
        """Describe the trip in one line.

        Returns
        -------
        str
            Human-readable summary of the trip
        """
        fragments = (
            f"Trip with {self.trip_length} purchases {self.purchases}",
            f" at store {self.store} in week {self.week}",
            f" with prices {self.prices} and assortment {self.assortment}",
        )
        return "".join(fragments)

    def get_items_up_to_index(self, i: int) -> np.ndarray:
        """Return the first i purchased items.

        Parameters
        ----------
        i: int
            Exclusive end position: items at positions 0 .. i - 1 are returned

        Returns
        -------
        np.ndarray
            Leading slice `purchases[:i]` of the basket
        """
        leading_items = self.purchases[:i]
        return leading_items

`__init__(purchases, prices, assortment, store=0, week=0)`

Initialize the trip.

Parameters:

Name	Type	Description	Default
`purchases`	`ndarray`	List of the ID of the purchased items, 0 to n_items - 1 (0-indexed) Shape must be (len_basket,), the last item is the checkout item 0	required
`store`	`int`	Store ID, 0 to n_stores - 1 (0-indexed)	`0`
`week`	`int`	Week number, 0 to 51 (0-indexed)	`0`
`prices`	`ndarray`	Prices of all the items in the dataset Shape must be (n_items,) with n_items the number of items in the TripDataset	required
`assortment`	`Union[int, ndarray]`	Assortment ID (int) corresponding to the assortment (ie its index in self.available_items) OR availability matrix (np.ndarray) of the assortment (binary vector of length n_items where 1 means the item is available and 0 means the item is not available) An assortment is the list of available items of a specific store at a given time	required

Source code in choice_learn/basket_models/dataset.py

def __init__(
    self,
    purchases: np.ndarray,
    prices: np.ndarray,
    assortment: Union[int, np.ndarray],
    store: int = 0,
    week: int = 0,
) -> None:
    """Build a trip from its purchases and their context.

    Parameters
    ----------
    purchases: np.ndarray
        IDs of the purchased items, each in 0 .. n_items - 1 (0-indexed)
        Shape must be (len_basket,); the last entry is the checkout item 0
    prices: np.ndarray
        Prices of all the items of the dataset
        Shape must be (n_items,), n_items being the number of items of
        the TripDataset
    assortment: int or np.ndarray
        Either the assortment ID (int), i.e. its row index in the
        `available_items` array of the TripDataset, or the availability
        vector itself (np.ndarray, binary, length n_items, 1 = available,
        0 = not available)
    store: int
        Store ID, 0 to n_stores - 1 (0-indexed), by default 0
    week: int
        Week number, 0 to 51 (0-indexed), by default 0

    Raises
    ------
    ValueError
        If `week` is not in 0 .. 51
    """
    week_is_valid = week in range(52)
    if not week_is_valid:
        raise ValueError("Week number must be between 0 and 51, inclusive.")

    # What was bought
    self.purchases = purchases
    # Where and when it was bought
    self.store = store
    self.week = week
    # Market conditions at that moment
    self.prices = prices
    self.assortment = assortment

    # Basket size, cached once at construction
    self.trip_length = len(purchases)

`__str__()`

Return short representation of the trip.

Returns:

Type	Description
`str`	Representation of the trip

Source code in choice_learn/basket_models/dataset.py

def __str__(self) -> str:
    """Describe the trip in one line.

    Returns
    -------
    str
        Human-readable summary of the trip
    """
    fragments = (
        f"Trip with {self.trip_length} purchases {self.purchases}",
        f" at store {self.store} in week {self.week}",
        f" with prices {self.prices} and assortment {self.assortment}",
    )
    return "".join(fragments)

`get_items_up_to_index(i)`

Get items up to index i.

Parameters:

Name	Type	Description	Default
`i`	`int`	Index of the item to get	required

Returns:

Type	Description
`ndarray`	List of items up to index i (excluded) Shape must be (i,)

Source code in choice_learn/basket_models/dataset.py

def get_items_up_to_index(self, i: int) -> np.ndarray:
    """Return the first i purchased items.

    Parameters
    ----------
    i: int
        Exclusive end position: items at positions 0 .. i - 1 are returned

    Returns
    -------
    np.ndarray
        Leading slice `purchases[:i]` of the basket
    """
    leading_items = self.purchases[:i]
    return leading_items

`TripDataset`

Class for a dataset of trips.

Source code in choice_learn/basket_models/dataset.py

class TripDataset:
    """Collection of trips together with the assortments they refer to.

    Attributes
    ----------
    trips: list[Trip]
        Trips of the dataset
    max_length: int
        Length of the longest basket; baskets are padded with -1 up to this
        length when batches are built (0 for an empty dataset)
    n_samples: int
        Total number of purchased items over all trips (one sample per item)
    available_items: np.ndarray
        Availability matrices, shape (n_assortments, n_items)
    """

    def __init__(self, trips: "list[Trip]", available_items: np.ndarray) -> None:
        """Initialize the dataset.

        Parameters
        ----------
        trips: list[Trip]
            List of trips
            Length must be n_trips
        available_items: np.ndarray
            Array of availability matrices
            available_items[i]: availability matrix of the assortment whose ID is i
            (The availability matrix is a binary vector of length n_items
            where 1 means the item is available and 0 means the item is not available)
            Shape must be (n_assortments, n_items)
        """
        self.trips = trips
        self.available_items = available_items
        self._update_size_attributes()

    def _update_size_attributes(self) -> None:
        """Recompute `max_length` and `n_samples` from `self.trips`."""
        # default=0 keeps an empty dataset constructible instead of failing in max()
        self.max_length = max((trip.trip_length for trip in self.trips), default=0)
        # One sample per purchased item: same value as len(self.get_transactions())
        # without materialising the whole transaction dict
        self.n_samples = sum(len(trip.purchases) for trip in self.trips)

    def _resolve_assortment(self, trip: "Trip") -> np.ndarray:
        """Return the availability vector of a trip.

        The trip stores either the availability vector itself (array-like) or
        the ID of an assortment, i.e. a row index of `self.available_items`.
        Anything that is not array-like is treated as an ID so that NumPy
        integers work the same way as Python ints.
        """
        if isinstance(trip.assortment, (np.ndarray, list, tuple)):
            return trip.assortment
        return self.available_items[trip.assortment]

    def __len__(self) -> int:
        """Return the number of trips in the dataset.

        Returns
        -------
        int
            Number of trips in the dataset
        """
        return len(self.trips)

    def __str__(self) -> str:
        """Return short representation of the dataset.

        Returns
        -------
        str
            Representation of the dataset
        """
        return f"TripDataset with {len(self)} trips"

    def __iter__(self) -> iter:
        """Iterate over the trips in the dataset.

        Returns
        -------
        iter
            Iterator over the trips
        """
        return iter(self.trips)

    def concatenate(self, other: "TripDataset", inplace: bool = False) -> "TripDataset":
        """Add a dataset to another.

        The trips of `other` are appended after the trips of this dataset and
        the availability matrices of `other` are stacked below the ones of
        this dataset.

        /!\\ The trips are appended unchanged: an integer assortment ID stored
        on a trip of `other` is NOT shifted, although the matrices of `other`
        now start at row `self.n_assortments` of the stacked array.

        Parameters
        ----------
        other: TripDataset
            Dataset to add
        inplace: bool
            Whether to add the dataset in-place or not, by default False

        Returns
        -------
        TripDataset
            Concatenated dataset (`self` when `inplace` is True)
        """
        # Stack the arrays first: if their shapes are incompatible this raises
        # before anything has been modified in place
        stacked_available_items = np.concatenate(
            (self.available_items, other.available_items), axis=0
        )

        if inplace:
            self.trips += other.trips
            self._update_size_attributes()
            self.available_items = stacked_available_items
            return self

        return TripDataset(
            trips=self.trips + other.trips,
            available_items=stacked_available_items,
        )

    def get_trip(self, index: int) -> "Trip":
        """Return the trip at the given index.

        Parameters
        ----------
        index: int
            Index of the trip to get

        Returns
        -------
        Trip
            Trip at the given index
        """
        return self.trips[index]

    def get_transactions(self) -> dict:
        """Return the transactions of the TripDataset.

        One transaction is a triplet (store, trip, item).

        Returns
        -------
        dict
            Transactions of the TripDataset
            keys: trans_id (0, 1, 2, ... in trip order, then basket order)
            values: (store, trip index, item)
        """
        transactions = {}
        for trip_index, trip in enumerate(self.trips):
            for item in trip.purchases:
                # len(transactions) is the next free transaction ID
                transactions[len(transactions)] = (trip.store, trip_index, item)
        return transactions

    def get_all_items(self) -> np.ndarray:
        """Return the list of all items available in the dataset.

        Returns
        -------
        np.ndarray
            Item IDs 0 .. n_items - 1
        """
        return np.arange(self.n_items)

    def get_all_baskets(self) -> np.ndarray:
        """Return the list of all baskets in the dataset.

        Returns
        -------
        np.ndarray
            Baskets of the dataset, in trip order.
            2D array (n_trips, len_basket) when all baskets have the same
            length, otherwise 1D object array holding one basket per entry
        """
        baskets = [trip.purchases for trip in self.trips]

        if len({len(basket) for basket in baskets}) > 1:
            # Baskets of different lengths cannot form a rectangular array and
            # np.array() raises on such input in recent NumPy versions
            ragged_baskets = np.empty(len(baskets), dtype=object)
            for position, basket in enumerate(baskets):
                ragged_baskets[position] = basket
            return ragged_baskets

        return np.array(baskets)

    def get_all_stores(self) -> np.ndarray:
        """Return the list of all stores in the dataset.

        Returns
        -------
        np.ndarray
            Distinct store IDs found in the trips (order not guaranteed)
        """
        # If preprocessing working well, equal to [0, 1, ..., n_stores - 1]
        return np.array(list({trip.store for trip in self.trips}))

    def get_all_weeks(self) -> np.ndarray:
        """Return the list of all weeks in the dataset.

        Returns
        -------
        np.ndarray
            Distinct week numbers found in the trips (order not guaranteed)
        """
        # If preprocessing working well, a subset of [0, 1, ..., 51]
        return np.array(list({trip.week for trip in self.trips}))

    def get_all_prices(self) -> np.ndarray:
        """Return the list of all price arrays in the dataset.

        Returns
        -------
        np.ndarray
            Price arrays of the trips, in trip order
        """
        return np.array([trip.prices for trip in self.trips])

    @property
    def n_items(self) -> int:
        """Return the number of items available in the dataset.

        Returns
        -------
        int
            Number of columns of `available_items`
        """
        return self.available_items.shape[1]

    @property
    def n_stores(self) -> int:
        """Return the number of stores in the dataset.

        Returns
        -------
        int
            Number of distinct store IDs found in the trips
        """
        return len(self.get_all_stores())

    @property
    def n_assortments(self) -> int:
        """Return the number of assortments in the dataset.

        Returns
        -------
        int
            Number of rows of `available_items`
        """
        return self.available_items.shape[0]

    def get_one_vs_all_augmented_data_from_trip_index(
        self,
        trip_index: int,
    ) -> tuple[np.ndarray]:
        """Get augmented data from a trip index.

        Augmented data consists in removing one item from the basket that will be used
        as a target from the remaining items. It is done for all items, leading to returning:
            - items,
            - padded baskets with an item removed,
            - future purchases (always empty for this method),
            - stores,
            - weeks,
            - prices,
            - available items.

        Parameters
        ----------
        trip_index: int
            Index of the trip from which to get the data

        Returns
        -------
        tuple[np.ndarray]
            For each sample (ie transaction) from the trip:
            item, basket, future purchases, store, week, prices, available items
            Length must be 7; the future purchases array has 0 rows

        Raises
        ------
        AttributeError
            If the prices of the trip are given as an ID while the dataset has
            no `prices` attribute to look them up in
        """
        # Get the trip from the index
        trip = self.trips[trip_index]
        length_trip = len(trip.purchases)
        purchases = np.array(trip.purchases)

        # One new basket per purchased item: row i is the basket in which item i
        # (the target) is replaced by -1. Rows are padded with -1 up to max_length
        # so that the baskets of all trips can be stacked in the same array
        padded_purchases_lacking_one_item = np.full(
            (length_trip, self.max_length), -1, dtype=int
        )
        padded_purchases_lacking_one_item[:, :length_trip] = purchases
        target_positions = np.arange(length_trip)
        padded_purchases_lacking_one_item[target_positions, target_positions] = -1

        assortment = self._resolve_assortment(trip)

        if isinstance(trip.prices, (np.ndarray, list)):
            # Then it is directly the price array
            prices = trip.prices
        else:
            # Then it is a price ID, ie an index in self.prices
            # NOTE: __init__ does not define self.prices, so this only works when
            # the attribute has been set on the dataset by other means
            price_table = getattr(self, "prices", None)
            if price_table is None:
                raise AttributeError(
                    "Trip prices are given as an ID but this TripDataset has no "
                    "'prices' attribute to look them up in."
                )
            prices = price_table[trip.prices]

        # Each item is linked to a basket, a store, a week, prices and an assortment
        return (
            purchases,  # Items
            padded_purchases_lacking_one_item,  # Baskets
            np.empty((0, self.max_length), dtype=int),  # Future purchases
            np.full(length_trip, trip.store),  # Stores
            np.full(length_trip, trip.week),  # Weeks
            np.tile(prices, (length_trip, 1)),  # Prices
            np.tile(assortment, (length_trip, 1)),  # Available items
        )

    def get_subbaskets_augmented_data_from_trip_index(
        self,
        trip_index: int,
    ) -> tuple[np.ndarray]:
        """Get augmented data from a trip index.

        Augmented data includes all the transactions obtained sequentially from the trip.
        In particular, items in the basket are shuffled and sub-baskets are built iteratively
        with the next item that will be used as a target. In particular, it leads to:
            - permuted items,
            - permuted, truncated and padded baskets,
            - padded future purchases based on the baskets,
            - stores,
            - weeks,
            - prices,
            - available items.

        Parameters
        ----------
        trip_index: int
            Index of the trip from which to get the data

        Returns
        -------
        tuple[np.ndarray]
            For each sample (ie transaction) from the trip:
            item, basket, future purchases, store, week, prices, available items
            Length must be 7
        """
        # Get the trip from the index
        trip = self.trips[trip_index]
        length_trip = len(trip.purchases)

        # Draw a random permutation of the items in the basket without the checkout item 0
        # Sampling all the positions without replacement is a uniform random
        # permutation; it avoids listing the (length_trip - 1)! permutations
        # TODO at a later stage: improve by sampling several permutations here
        n_permuted_items = max(length_trip - 1, 0)
        permutation = random.sample(range(n_permuted_items), n_permuted_items)

        # Permute the basket while keeping the checkout item 0 at the end
        # (the last purchase is not read: it is assumed to be the checkout item)
        permuted_purchases = np.array([trip.purchases[j] for j in permutation] + [0])

        # Row i of padded_truncated_purchases is the basket before item i is added,
        # ranging from an empty basket to the basket with all the elements except
        # the checkout item
        # Row i of padded_future_purchases is its complement, ie the items that will
        # be purchased after item i during the next steps of the trip
        # Both are padded with -1 up to max_length so that the baskets of all trips
        # can be stacked in the same array
        padded_truncated_purchases = np.full((length_trip, self.max_length), -1, dtype=int)
        padded_future_purchases = np.full((length_trip, self.max_length), -1, dtype=int)
        for i in range(length_trip):
            padded_truncated_purchases[i, :i] = permuted_purchases[:i]
            future_purchases = permuted_purchases[i + 1 :]
            padded_future_purchases[i, : len(future_purchases)] = future_purchases

        assortment = self._resolve_assortment(trip)

        # Each item is linked to a basket, the future purchases,
        # a store, a week, prices and an assortment
        return (
            permuted_purchases,  # Items
            padded_truncated_purchases,  # Baskets
            padded_future_purchases,  # Future purchases
            np.full(length_trip, trip.store),  # Stores
            np.full(length_trip, trip.week),  # Weeks
            np.tile(trip.prices, (length_trip, 1)),  # Prices
            np.tile(assortment, (length_trip, 1)),  # Available items
        )

    def iter_batch(
        self,
        batch_size: int,
        shuffle: bool = False,
        data_method: str = "shopper",
    ) -> object:
        """Iterate over a TripDataset to return batches of items of length batch_size.

        Every sample of every trip is yielded exactly once: the last batch
        holds the remaining samples and may be shorter than batch_size.

        Parameters
        ----------
        batch_size: int
            Batch size (number of items in the batch), or -1 to get the whole
            dataset in a single batch
        shuffle: bool
            Whether or not to shuffle the order of the trips
        data_method: str
            How the samples of a trip are built, by default "shopper":
            "shopper" uses get_subbaskets_augmented_data_from_trip_index,
            "aleacarta" uses get_one_vs_all_augmented_data_from_trip_index

        Yields
        ------
        tuple[np.ndarray]
            For each item in the batch: item, basket, future purchases,
            store, week, prices, available items
            Length must be 7

        Raises
        ------
        ValueError
            If data_method is unknown, or if batch_size is neither -1 nor a
            positive integer
        """
        augmentation_methods = {
            "shopper": self.get_subbaskets_augmented_data_from_trip_index,
            "aleacarta": self.get_one_vs_all_augmented_data_from_trip_index,
        }
        if data_method not in augmentation_methods:
            raise ValueError(f"Unknown data method: {data_method}")
        if batch_size != -1 and batch_size < 1:
            # A batch size of 0 (or below -1) would never consume the buffer
            raise ValueError("batch_size must be a positive integer or -1 (whole dataset).")
        get_augmented_data = augmentation_methods[data_method]

        # Get trip indexes
        trip_indexes = np.arange(len(self))

        # Shuffle trip indexes
        # TODO: shuffling on the trip indexes or on the item indexes?
        if shuffle:
            trip_indexes = np.random.default_rng().permutation(trip_indexes)

        # Initialize the buffer
        buffer = (
            np.empty(0, dtype=int),  # Items
            np.empty((0, self.max_length), dtype=int),  # Baskets
            np.empty((0, self.max_length), dtype=int),  # Future purchases
            np.empty(0, dtype=int),  # Stores
            np.empty(0, dtype=int),  # Weeks
            np.empty((0, self.n_items), dtype=int),  # Prices
            np.empty((0, self.n_items), dtype=int),  # Available items
        )

        for trip_index in trip_indexes:
            # Fill the buffer with the augmented data of the next trip
            additional_trip_data = get_augmented_data(trip_index)
            buffer = tuple(
                np.concatenate((stored, additional))
                for stored, additional in zip(buffer, additional_trip_data)
            )

            # Empty the buffer batch by batch as long as it holds a full batch
            # (never when the whole dataset is requested in one batch)
            while batch_size != -1 and len(buffer[0]) >= batch_size:
                yield tuple(column[:batch_size] for column in buffer)
                buffer = tuple(column[batch_size:] for column in buffer)

        # Yield the whole dataset, or the samples left over after the last full batch
        if batch_size == -1 or len(buffer[0]) > 0:
            yield buffer

    def __getitem__(
        self, index: Union[int, list, np.ndarray, range, slice]
    ) -> "TripDataset":
        """Return a TripDataset object populated with the trips at index.

        The new dataset shares the `available_items` array of this dataset, so
        the assortment IDs of the selected trips remain valid.

        Parameters
        ----------
        index: int, list[int], np.ndarray, range or slice
            Index or indices of the trip(s) to get

        Returns
        -------
        TripDataset
            Dataset holding only the selected trip(s)

        Raises
        ------
        TypeError
            If the type of index is not supported
        """
        # np.integer covers the indices obtained from NumPy arrays (e.g. np.int64)
        if isinstance(index, (int, np.integer)):
            selected_trips = [self.trips[index]]
        elif isinstance(index, (list, np.ndarray, range)):
            selected_trips = [self.trips[i] for i in index]
        elif isinstance(index, slice):
            selected_trips = self.trips[index]
        else:
            raise TypeError("Type of index must be int, list, np.ndarray, range or slice.")

        return TripDataset(trips=selected_trips, available_items=self.available_items)

`n_assortments: int` `property`

Return the number of assortments in the dataset.

Returns:

Type	Description
`int`	Number of assortments in the dataset

`n_items: int` `property`

Return the number of items available in the dataset.

Returns:

Type	Description
`int`	Number of items available in the dataset

`n_stores: int` `property`

Return the number of stores in the dataset.

Returns:

Type	Description
`int`	Number of stores in the dataset

`__getitem__(index)`

Return a TripDataset object populated with the trips at index.

Parameters:

Name	Type	Description	Default
`index`	`Union[int, list, ndarray, range, slice]`	Index or list of indices of the trip(s) to get	required

Returns:

Type	Description
`TripDataset`	New TripDataset populated with the trip(s) at the given index or indices

Source code in choice_learn/basket_models/dataset.py

def __getitem__(self, index: Union[int, list, np.ndarray, range, slice]) -> object:
    """Return a TripDataset object populated with the trips at index.

    The new dataset shares the `available_items` array of this dataset, so
    the assortment IDs of the selected trips remain valid.

    Parameters
    ----------
    index: int, list[int], np.ndarray, range or slice
        Index or indices of the trip(s) to get

    Returns
    -------
    TripDataset
        Dataset holding only the selected trip(s)

    Raises
    ------
    TypeError
        If the type of index is not supported
    """
    # np.integer covers the indices obtained from NumPy arrays (e.g. np.int64),
    # which are not instances of the built-in int
    if isinstance(index, (int, np.integer)):
        selected_trips = [self.trips[index]]
    elif isinstance(index, (list, np.ndarray, range)):
        selected_trips = [self.trips[i] for i in index]
    elif isinstance(index, slice):
        selected_trips = self.trips[index]
    else:
        raise TypeError("Type of index must be int, list, np.ndarray, range or slice.")

    return TripDataset(trips=selected_trips, available_items=self.available_items)

`__init__(trips, available_items)`

Initialize the dataset.

Parameters:

Name	Type	Description	Default
`trips`	`list[Trip]`	List of trips Length must be n_trips	required
`available_items`	`ndarray`	Array of availability matrices available_items[i]: availability matrix of the assortment whose ID is i (The availability matrix is a binary vector of length n_items where 1 means the item is available and 0 means the item is not available) Shape must be (n_assortments, n_items)	required

Source code in choice_learn/basket_models/dataset.py

def __init__(self, trips: "list[Trip]", available_items: np.ndarray) -> None:
    """Initialize the dataset.

    Parameters
    ----------
    trips: list[Trip]
        List of trips
        Length must be n_trips (may be empty)
    available_items: np.ndarray
        Array of availability matrices
        available_items[i]: availability matrix of the assortment whose ID is i
        (The availability matrix is a binary vector of length n_items
        where 1 means the item is available and 0 means the item is not available)
        Shape must be (n_assortments, n_items)
    """
    self.trips = trips
    # Length of the longest basket; default=0 keeps an empty dataset
    # constructible instead of failing in max()
    self.max_length = max((trip.trip_length for trip in self.trips), default=0)
    # One sample per purchased item: same value as len(self.get_transactions())
    # without materialising the whole transaction dict
    self.n_samples = sum(len(trip.purchases) for trip in self.trips)
    self.available_items = available_items

`__iter__()`

Iterate over the trips in the dataset.

Returns:

Type	Description
`iter`	Iterator over the trips

Source code in choice_learn/basket_models/dataset.py

def __iter__(self) -> iter:
    """Walk through the trips of the dataset, in storage order.

    Returns
    -------
    iter
        Iterator over `self.trips`
    """
    trip_iterator = iter(self.trips)
    return trip_iterator

`__len__()`

Return the number of trips in the dataset.

Returns:

Type	Description
`int`	Number of trips in the dataset

Source code in choice_learn/basket_models/dataset.py

def __len__(self) -> int:
    """Count the trips held by the dataset.

    Returns
    -------
    int
        Length of `self.trips`
    """
    n_trips = len(self.trips)
    return n_trips

`__str__()`

Return short representation of the dataset.

Returns:

Type	Description
`str`	Representation of the dataset

Source code in choice_learn/basket_models/dataset.py

def __str__(self) -> str:
    """Describe the dataset in one line.

    Returns
    -------
    str
        Summary giving the number of trips of the dataset
    """
    description = f"TripDataset with {len(self)} trips"
    return description

`concatenate(other, inplace=False)`

Add a dataset to another.

Parameters:

Name	Type	Description	Default
`other`	`object`	Dataset to add	required
`inplace`	`bool`	Whether to add the dataset in-place or not, by default False	`False`

Returns:

Type	Description
`TripDataset`	Concatenated dataset

Source code in choice_learn/basket_models/dataset.py

def concatenate(self, other: object, inplace: bool = False) -> object:
    """Append the trips and assortments of another dataset to this one.

    /!\\ The availability matrices of `other` are stacked below the ones of
    this dataset, so their indices in the resulting array change, while the
    trips themselves are appended unchanged.

    Parameters
    ----------
    other: TripDataset
        Dataset whose trips and availability matrices are appended
    inplace: bool
        True to extend this dataset, False (default) to build a new one

    Returns
    -------
    TripDataset
        This dataset when `inplace` is True, otherwise a new dataset
    """
    if not inplace:
        # Build a fresh dataset out of both lists of trips and both arrays
        # of availability matrices
        return TripDataset(
            trips=self.trips + other.trips,
            available_items=np.concatenate((self.available_items, other.available_items), axis=0),
        )

    # In-place: extend the list of trips...
    self.trips += other.trips
    # ... refresh the attributes derived from the trips...
    self.max_length = max(trip.trip_length for trip in self.trips)
    self.n_samples = len(self.get_transactions())
    # ... and stack the availability matrices of both datasets
    both_available_items = (self.available_items, other.available_items)
    self.available_items = np.concatenate(both_available_items, axis=0)
    return self

`get_all_baskets()`

Return the list of all baskets in the dataset.

Returns:

Type	Description
`ndarray`	List of baskets in the dataset

Source code in choice_learn/basket_models/dataset.py

def get_all_baskets(self) -> np.ndarray:
    """Return the list of all baskets in the dataset.

    Returns
    -------
    np.ndarray
        Baskets of the dataset, one entry per trip.
        When all baskets have the same length this is a regular 2D array of
        shape (n_trips, len_basket); when lengths differ it is a 1D array of
        dtype object whose elements are the individual baskets
    """
    baskets = [self.trips[i].purchases for i in range(len(self))]
    try:
        return np.array(baskets)
    except ValueError:
        # Recent NumPy versions refuse to build an array from sequences of
        # different lengths; store each basket as one element instead
        ragged = np.empty(len(baskets), dtype=object)
        for position, basket in enumerate(baskets):
            ragged[position] = basket
        return ragged

`get_all_items()`

Return the list of all items available in the dataset.

Returns:

Type	Description
`ndarray`	List of items available in the dataset

Source code in choice_learn/basket_models/dataset.py

def get_all_items(self) -> np.ndarray:
    """Return the IDs of all the items of the dataset.

    Returns
    -------
    np.ndarray
        Item IDs, from 0 to self.n_items - 1
    """
    item_count = self.n_items
    return np.arange(item_count)

`get_all_prices()`

Return the list of all price arrays in the dataset.

Returns:

Type	Description
`ndarray`	List of price arrays in the dataset

Source code in choice_learn/basket_models/dataset.py

def get_all_prices(self) -> np.ndarray:
    """Return the prices attached to every trip of the dataset.

    Returns
    -------
    np.ndarray
        One entry per trip, in the order of the trips: the `prices`
        attribute of that trip
    """
    n_trips = len(self)
    prices_per_trip = [self.trips[idx].prices for idx in range(n_trips)]
    return np.array(prices_per_trip)

`get_all_stores()`

Return the list of all stores in the dataset.

Returns:

Type	Description
`ndarray`	List of stores in the dataset

Source code in choice_learn/basket_models/dataset.py

def get_all_stores(self) -> np.ndarray:
    """Return the list of all stores in the dataset.

    Returns
    -------
    np.ndarray
        Distinct store IDs found in the trips, sorted in increasing order
    """
    distinct_stores = {self.trips[i].store for i in range(len(self))}
    # Sort explicitly: set iteration order is an implementation detail and is
    # not increasing in general (e.g. {8, 1} iterates as 8, 1)
    # If preprocessing working well, equal to [0, 1, ..., n_stores - 1]
    return np.array(sorted(distinct_stores))

`get_all_weeks()`

Return the list of all weeks in the dataset.

Returns:

Type	Description
`ndarray`	List of weeks in the dataset

Source code in choice_learn/basket_models/dataset.py

def get_all_weeks(self) -> np.ndarray:
    """Return the list of all weeks in the dataset.

    Returns
    -------
    np.ndarray
        Distinct week numbers found in the trips, sorted in increasing order
    """
    distinct_weeks = {self.trips[i].week for i in range(len(self))}
    # Sort explicitly: set iteration order is an implementation detail and is
    # not increasing in general (e.g. {8, 1} iterates as 8, 1)
    # Trip checks the week against range(52), so when every week is
    # represented this is equal to [0, 1, ..., 51]
    return np.array(sorted(distinct_weeks))

`get_one_vs_all_augmented_data_from_trip_index(trip_index)`

Get augmented data from a trip index.

Augmented data consists of removing one item from the basket and using it as the target to predict from the remaining items. This is done for every item of the basket, and the method returns: items, padded baskets with one item removed, an empty placeholder for future purchases, stores, weeks, prices, and available items.

Parameters:

Name	Type	Description	Default
`trip_index`	`int`	Index of the trip from which to get the data	required

Returns:

Type	Description
`tuple[ndarray]`	For each sample (i.e. transaction) from the trip: item, basket, future purchases (empty placeholder), store, week, prices, available items. Length is 7.

Source code in choice_learn/basket_models/dataset.py

def get_one_vs_all_augmented_data_from_trip_index(
    self,
    trip_index: int,
) -> tuple[np.ndarray]:
    """Build the leave-one-out samples of a trip.

    Every purchased item is used in turn as the target: it is masked with -1 in
    the basket, the other items are kept in place, and the basket is padded with
    -1 up to self.max_length. The returned tuple holds:
        - items,
        - padded baskets with an item removed,
        - an empty placeholder for the future purchases,
        - stores,
        - weeks,
        - prices,
        - available items.

    Parameters
    ----------
    trip_index: int
        Index of the trip from which to get the data

    Returns
    -------
    tuple[np.ndarray]
        item, basket, future purchases (always 0 rows), store, week, prices,
        available items; all entries but the future purchases have one row
        per purchased item of the trip
        Length is 7
    """
    trip = self.trips[trip_index]
    items = np.array(trip.purchases)
    n_samples = len(trip.purchases)

    # One row per target item: the target is replaced by -1 and the row is
    # right-padded with -1 so that all rows have length self.max_length
    rows = []
    for removed in range(n_samples):
        rows.append(
            np.concatenate(
                (
                    items[:removed],
                    [-1],
                    items[removed + 1 :],
                    -1 * np.ones(self.max_length - n_samples),
                )
            )
        )
    baskets = np.array(rows, dtype=int)

    if isinstance(trip.assortment, (np.ndarray, list)):
        # The trip directly carries its availability vector
        assortment = trip.assortment
    else:
        # The trip carries an assortment ID, ie an index in self.available_items
        assortment = self.available_items[trip.assortment]

    if isinstance(trip.prices, (np.ndarray, list)):
        # The trip directly carries its prices
        prices = trip.prices
    else:
        # The trip carries an index that is looked up in self.prices
        prices = self.prices[trip.prices]

    return (
        items,  # Items
        baskets,  # Baskets
        np.empty((0, self.max_length), dtype=int),  # Future purchases
        np.full(n_samples, trip.store),  # Stores
        np.full(n_samples, trip.week),  # Weeks
        np.tile(prices, (n_samples, 1)),  # Prices
        np.tile(assortment, (n_samples, 1)),  # Available items
    )

`get_subbaskets_augmented_data_from_trip_index(trip_index)`

Get augmented data from a trip index.

Augmented data includes all the transactions obtained sequentially from the trip. In particular, items in the basket are shuffled and sub-baskets are built iteratively with the next item that will be used as a target. In particular, it leads to: - permuted items, - permuted, truncated and padded baskets, - padded future purchases based on the baskets, - stores, - weeks, - prices, - available items.

Parameters:

Name	Type	Description	Default
`trip_index`	`int`	Index of the trip from which to get the data	required

Returns:

Type	Description
`tuple[ndarray]`	For each sample (ie transaction) from the trip: item, basket, future purchases, store, week, prices, available items Length must be 7

Source code in choice_learn/basket_models/dataset.py

def get_subbaskets_augmented_data_from_trip_index(
    self,
    trip_index: int,
) -> tuple[np.ndarray]:
    """Get augmented data from a trip index.

    Augmented data includes all the transactions obtained sequentially from the trip.
    In particular, items in the basket are shuffled and sub-baskets are built iteratively
    with the next item that will be used as a target. In particular, it leads to:
        - permuted items,
        - permuted, truncated and padded baskets,
        - padded future purchases based on the baskets,
        - stores,
        - weeks,
        - prices,
        - available items.

    Parameters
    ----------
    trip_index: int
        Index of the trip from which to get the data

    Returns
    -------
    tuple[np.ndarray]
        For each sample (ie transaction) from the trip:
        item, basket, future purchases, store, week, prices, available items
        Length must be 7
    """
    # Get the trip from the index
    trip = self.trips[trip_index]
    length_trip = len(trip.purchases)

    # Draw a random permutation of the items in the basket without the checkout item 0
    # The permutation is sampled directly: enumerating all the (length_trip - 1)!
    # permutations just to pick one is intractable for baskets of a dozen items
    # TODO at a later stage: improve by sampling several permutations here
    n_shuffled = max(length_trip - 1, 0)
    permutation = random.sample(range(n_shuffled), n_shuffled)

    # Permute the basket while keeping the checkout item 0 at the end
    permuted_purchases = np.array([trip.purchases[j] for j in permutation] + [0])

    # Truncate the baskets: for each batch sample, we consider the truncation possibilities
    # ranging from an empty basket to the basket with all the elements except the checkout item
    # And pad the truncated baskets with -1 to have the same length (because we need
    # numpy arrays for tiling and numpy arrays must have the same length)
    padded_truncated_purchases = np.array(
        [
            np.concatenate((permuted_purchases[:i], -1 * np.ones(self.max_length - i)))
            for i in range(0, length_trip)
        ],
        dtype=int,
    )

    # padded_future_purchases are the complements of padded_truncated_purchases, ie the
    # items that are not yet in the (permuted) basket but that we know will be purchased
    # during the next steps of the trip
    # Pad the future purchases with -1 to have the same length
    padded_future_purchases = np.array(
        [
            np.concatenate(
                (
                    permuted_purchases[i + 1 :],
                    -1 * np.ones(self.max_length - len(permuted_purchases) + i + 1),
                )
            )
            for i in range(0, length_trip)
        ],
        dtype=int,
    )

    # np.integer is checked too: NumPy integer scalars are not instances of int
    if isinstance(trip.assortment, (int, np.integer)):
        # Then it is the assortment ID (ie its index in self.available_items)
        assortment = self.available_items[trip.assortment]
    else:  # np.ndarray
        # Then it is directly the availability matrix
        assortment = trip.assortment

    # Each item is linked to a basket, the future purchases,
    # a store, a week, prices and an assortment
    return (
        permuted_purchases,  # Items
        padded_truncated_purchases,  # Baskets
        padded_future_purchases,  # Future purchases
        np.full(length_trip, trip.store),  # Stores
        np.full(length_trip, trip.week),  # Weeks
        np.tile(trip.prices, (length_trip, 1)),  # Prices
        np.tile(assortment, (length_trip, 1)),  # Available items
    )

`get_transactions()`

Return the transactions of the TripDataset.

One transaction is a triplet (store, trip, item).

Returns:

Type	Description
`dict`	Transactions of the TripDataset keys: trans_id values: (store, trip, item)

Source code in choice_learn/basket_models/dataset.py

def get_transactions(self) -> dict[int, tuple]:
    """Return the transactions of the TripDataset.

    One transaction is a triplet (store, trip, item).

    Returns
    -------
    dict
        Transactions of the TripDataset
        keys: trans_id, consecutive integers starting at 0 that follow the order
        of the trips and, within a trip, the order of its purchases
        values: (store, trip, item) where trip is the index of the trip in self.trips
    """
    triplets = (
        (trip.store, trip_position, item)
        for trip_position, trip in enumerate(self.trips)
        for item in trip.purchases
    )
    # enumerate numbers the transactions 0, 1, 2, ... across all the trips
    return dict(enumerate(triplets))

`get_trip(index)`

Return the trip at the given index.

Parameters:

Name	Type	Description	Default
`index`	`int`	Index of the trip to get	required

Returns:

Type	Description
`Trip`	Trip at the given index

Source code in choice_learn/basket_models/dataset.py

def get_trip(self, index: int) -> Trip:
    """Fetch a single trip of the dataset.

    Parameters
    ----------
    index: int
        Position of the requested trip in self.trips

    Returns
    -------
    Trip
        Trip stored at that position
    """
    requested_trip = self.trips[index]
    return requested_trip

`iter_batch(batch_size, shuffle=False, data_method='shopper')`

Iterate over a TripDataset to return batches of items of length batch_size.

Parameters:

Name	Type	Description	Default
`batch_size`	`int`	Batch size (number of items in the batch)	required
`shuffle`	`bool`	Whether or not to shuffle the dataset	`False`

Yields:

Type	Description
`tuple[ndarray]`	For each item in the batch: item, basket, future purchases, store, week, prices, available items. Length must be 7.

Source code in choice_learn/basket_models/dataset.py

def iter_batch(
    self,
    batch_size: int,
    shuffle: bool = False,
    data_method: str = "shopper",
) -> object:
    """Iterate over a TripDataset to return batches of items of length batch_size.

    Every sample produced by the trips is yielded exactly once: the last batch
    holds the remaining samples and may therefore be shorter than batch_size.

    Parameters
    ----------
    batch_size: int
        Batch size (number of items in the batch)
        -1 yields the whole dataset in a single batch
    shuffle: bool
        Whether or not to shuffle the dataset (the order of the trips is shuffled)
    data_method: str
        How samples are built from a trip:
        "shopper" uses get_subbaskets_augmented_data_from_trip_index,
        "aleacarta" uses get_one_vs_all_augmented_data_from_trip_index

    Yields
    ------
    tuple[np.ndarray]
        For each item in the batch: item, basket, future purchases,
        store, week, prices, available items
        Length must be 7

    Raises
    ------
    ValueError
        If data_method is unknown, or if batch_size is neither -1 nor a
        strictly positive integer
    """
    # Select the augmentation method once instead of at every trip
    if data_method == "shopper":
        augment = self.get_subbaskets_augmented_data_from_trip_index
    elif data_method == "aleacarta":
        augment = self.get_one_vs_all_augmented_data_from_trip_index
    else:
        raise ValueError(f"Unknown data method: {data_method}")

    # A batch size of 0 (or below -1) could never be filled: fail loudly
    # rather than looping forever
    if batch_size != -1 and batch_size < 1:
        raise ValueError(f"batch_size must be -1 or a positive integer, got {batch_size}")

    # Get trip indexes
    num_trips = len(self)
    trip_indexes = np.arange(num_trips)

    # Shuffle trip indexes
    # TODO: shuffling on the trip indexes or on the item indexes?
    if shuffle:
        trip_indexes = np.random.default_rng().permutation(trip_indexes)

    # Initialize the buffer
    buffer = (
        np.empty(0, dtype=int),  # Items
        np.empty((0, self.max_length), dtype=int),  # Baskets
        np.empty((0, self.max_length), dtype=int),  # Future purchases
        np.empty(0, dtype=int),  # Stores
        np.empty(0, dtype=int),  # Weeks
        np.empty((0, self.n_items), dtype=int),  # Prices
        np.empty((0, self.n_items), dtype=int),  # Available items
    )

    if batch_size == -1:
        # Get the whole dataset in one batch: gather the data of every trip and
        # concatenate each component once (the empty buffer comes first so that
        # an empty dataset still yields correctly shaped arrays)
        chunks = [augment(trip_index) for trip_index in trip_indexes]
        yield tuple(
            np.concatenate([buffer[i]] + [chunk[i] for chunk in chunks])
            for i in range(len(buffer))
        )
        return

    # Yield batches of size batch_size while going through all the trips
    # Keep going while trips remain to be read OR samples remain in the buffer,
    # otherwise the samples left after the last trip would be dropped
    index = 0
    while index < num_trips or len(buffer[0]) > 0:
        # Fill the buffer with trips' augmented data until it reaches the batch size
        # or until there are no more trips to consider
        while len(buffer[0]) < batch_size and index < num_trips:
            additional_trip_data = augment(trip_indexes[index])
            index += 1
            buffer = tuple(
                np.concatenate((buffer[i], additional_trip_data[i]))
                for i in range(len(buffer))
            )

        if len(buffer[0]) == 0:
            # The last trips did not produce any sample: nothing left to yield
            break

        # Get the batch (possibly partially filled at the very end)
        # and update the next buffer
        batch = tuple(buffer[i][:batch_size] for i in range(len(buffer)))
        buffer = tuple(buffer[i][batch_size:] for i in range(len(buffer)))

        # Yield the batch
        yield batch

Trip and TripDataset Data Structures

Trip

__init__(purchases, prices, assortment, store=0, week=0)

__str__()

get_items_up_to_index(i)

TripDataset

n_assortments: int property

n_items: int property

n_stores: int property

__getitem__(index)

__init__(trips, available_items)

__iter__()

__len__()

__str__()

concatenate(other, inplace=False)

get_all_baskets()

get_all_items()

get_all_prices()

get_all_stores()

get_all_weeks()

get_one_vs_all_augmented_data_from_trip_index(trip_index)

get_subbaskets_augmented_data_from_trip_index(trip_index)

get_transactions()

get_trip(index)

iter_batch(batch_size, shuffle=False, data_method='shopper')

`Trip`

`init(purchases, prices, assortment, store=0, week=0)`

`str()`

`get_items_up_to_index(i)`

`TripDataset`

`n_assortments: int` `property`

`n_items: int` `property`

`n_stores: int` `property`

`getitem(index)`

`init(trips, available_items)`

`iter()`

`len()`

`str()`

`concatenate(other, inplace=False)`

`get_all_baskets()`

`get_all_items()`

`get_all_prices()`

`get_all_stores()`

`get_all_weeks()`

`get_one_vs_all_augmented_data_from_trip_index(trip_index)`

`get_subbaskets_augmented_data_from_trip_index(trip_index)`

`get_transactions()`

`get_trip(index)`

`iter_batch(batch_size, shuffle=False, data_method='shopper')`