# Source code for beta_rec.datasets.instacart

import os
import random

import pandas as pd

from ..datasets.dataset_base import DatasetBase
from ..utils.common_util import un_zip
from ..utils.constants import (
    DEFAULT_FLAG_COL,
    DEFAULT_ITEM_COL,
    DEFAULT_ORDER_COL,
    DEFAULT_RATING_COL,
    DEFAULT_TIMESTAMP_COL,
    DEFAULT_USER_COL,
)

# Manual download page (Kaggle competition data; requires login, so it
# cannot be fetched automatically by the framework's downloader).
INSTACART_URL = "https://www.kaggle.com/c/instacart-market-basket-analysis/data"

# OneDrive URLs for the pre-processed versions of this dataset, one per
# split strategy supported by DatasetBase.
INSTACART_RANDOM_SPLIT_URL = (
    r"https://1drv.ms/u/s!AjMahLyQeZqugX4W4zLO6Jkx8P-W?e=oKymnV"
)
INSTACART_TEMPORAL_SPLIT_URL = (
    r"https://1drv.ms/u/s!AjMahLyQeZquggAblxVFSYeu3nzh?e=pzBaAa"
)
INSTACART_LEAVE_ONE_OUT_URL = (
    r"https://1drv.ms/u/s!AjMahLyQeZquggLQynzcCWfNUdIg?e=HDhUjL"
)

# Instructions shown to the user when the raw files are missing.
INSTACART_TIPS = """
    Instacart dataset can not be downloaded by this url automatically, and you need to do:
    1. Download this dataset via 'https://www.kaggle.com/c/instacart-market-basket-analysis/data',
    2. Put 'instacart-market-basket-analysis.zip' into the directory `instacart/raw/`,
    3. Unzip 'instacart-market-basket-analysis.zip', put all the *.csv files into 'instacart/raw/'.
    4. Rerun this program.
"""


class Instacart(DatasetBase):
    """Instacart grocery-basket dataset.

    If the dataset cannot be downloaded from the URL, download it manually from
    'https://s3.amazonaws.com/instacart-datasets/instacart_online_grocery_shopping_2017_05_01.tar.gz',
    put it into the directory `instacart/raw`, unzip it and rename the
    directory to 'instacart'.

    The dataset is used to predict when users buy a product for the next
    time; interactions are organised as [order_id, product_id].
    """

    def __init__(
        self, dataset_name="instacart", min_u_c=0, min_i_c=3, min_o_c=0, root_dir=None
    ):
        """Init Instacart Class.

        Args:
            dataset_name: directory name used under the data root.
            min_u_c: minimum interactions per user kept after filtering.
            min_i_c: minimum interactions per item kept after filtering.
            min_o_c: minimum interactions per order kept after filtering.
            root_dir: optional override for the dataset root directory.
        """
        # Delegate all path setup, filtering and download bookkeeping to the
        # shared DatasetBase implementation.
        super().__init__(
            dataset_name=dataset_name,
            min_u_c=min_u_c,
            min_i_c=min_i_c,
            min_o_c=min_o_c,
            root_dir=root_dir,
            manual_download_url=INSTACART_URL,
            tips=INSTACART_TIPS,
            processed_random_split_url=INSTACART_RANDOM_SPLIT_URL,
            processed_temporal_split_url=INSTACART_TEMPORAL_SPLIT_URL,
            processed_leave_one_out_url=INSTACART_LEAVE_ONE_OUT_URL,
        )
[docs] def preprocess(self): """Preprocess the raw file. Preprocess the file downloaded via the url, convert it to a dataframe consist of the user-item interaction and save in the processed directory. Download and load datasets 1. Download instacart dataset if this dataset is not existed. 2. Load <order> table and <order_products> table from "orders.csv" and "order_products__train.csv". 3. Merge the two tables above. 4. Add additional columns [rating, timestamp]. 5. Rename columns and save data model. """ # Step 1: Download instacart dataset if this dataset is not existed. print("Start loading data from raw data") order_products_prior_file = os.path.join( self.raw_path, "order_products__prior.csv" ) order_products_train_file = os.path.join( self.raw_path, "order_products__train.csv" ) if not os.path.exists(order_products_prior_file) or not os.path.exists( order_products_train_file ): print("Raw file doesn't exist, try to download it.") self.download() orders_file = os.path.join(self.raw_path, "orders.csv") # order_products__*.csv: order_id,product_id,add_to_cart_order,reordered prior_products = pd.read_csv( order_products_prior_file, usecols=["order_id", "product_id", "add_to_cart_order"], ) train_products = pd.read_csv( order_products_train_file, usecols=["order_id", "product_id", "add_to_cart_order"], ) order_products = pd.concat([prior_products, train_products]) # orders.csv: order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order orders = pd.read_csv( orders_file, usecols=["user_id", "order_id", "order_number", "eval_set"] ) user_products = order_products.merge(orders, how="left", on="order_id") # user_item_id = user_products.groupby(["user_id"]).count() # user_order_number = user_products.groupby(["user_id", "order_number"]).count() order_addtocart_user = ( user_products.groupby( ["order_id", "add_to_cart_order", "user_id", "product_id", "eval_set"] ) .size() .rename("ratings") .reset_index() ) order_addtocart_user.rename( 
columns={ "order_id": DEFAULT_ORDER_COL, "user_id": DEFAULT_USER_COL, "product_id": DEFAULT_ITEM_COL, "ratings": DEFAULT_RATING_COL, "eval_set": DEFAULT_FLAG_COL, }, inplace=True, ) timestamp_col = {DEFAULT_TIMESTAMP_COL: order_addtocart_user.index} order_addtocart_user = order_addtocart_user.assign(**timestamp_col) print("Loading raw data completed") # save processed data into the disk. self.save_dataframe_as_npz( order_addtocart_user, os.path.join(self.processed_path, f"{self.dataset_name}_interaction.npz"), )
class Instacart_25(DatasetBase):
    """Instacart dataset, 25%-of-users variant.

    If the dataset cannot be downloaded from the URL, download it manually from
    'https://s3.amazonaws.com/instacart-datasets/instacart_online_grocery_shopping_2017_05_01.tar.gz',
    put it into the directory `instacart/raw`, unzip it and rename the
    directory to 'instacart'.

    The dataset is used to predict when users buy a product for the next
    time; interactions are organised as [order_id, product_id].
    """

    def __init__(
        self, dataset_name="instacart_25", min_u_c=0, min_i_c=3, min_o_c=0,
    ):
        """Init Instacart_25 Class.

        Args:
            dataset_name: directory name used under the data root.
            min_u_c: minimum interactions per user kept after filtering.
            min_i_c: minimum interactions per item kept after filtering.
            min_o_c: minimum interactions per order kept after filtering.
        """
        # Delegate path setup, filtering and download bookkeeping to the
        # shared DatasetBase implementation.
        super().__init__(
            dataset_name=dataset_name,
            min_u_c=min_u_c,
            min_i_c=min_i_c,
            min_o_c=min_o_c,
            manual_download_url="https://www.kaggle.com/c/6644/download-all",
            processed_random_split_url=INSTACART_RANDOM_SPLIT_URL,
            processed_temporal_split_url=INSTACART_TEMPORAL_SPLIT_URL,
        )
[docs] def preprocess(self): """Preprocess the raw file. Preprocess the file downloaded via the url, convert it to a dataframe consist of the user-item interaction and save in the processed directory Download and load datasets 1. Download instacart dataset if this dataset is not existed. 2. Load <order> table and <order_products> table from "orders.csv" and "order_products__train.csv". 3. Merge the two tables above. 4. Add additional columns [rating, timestamp]. 5. Rename columns and save data model. """ # Step 1: Download instacart dataset if this dataset is not existed. print("Start loading data from raw data") order_products_prior_file = os.path.join( self.raw_path, "order_products__prior.csv" ) order_products_train_file = os.path.join( self.raw_path, "order_products__train.csv" ) if not os.path.exists(order_products_prior_file) or not os.path.exists( order_products_train_file ): print("Raw file doesn't exist, try to download it.") self.download() file_name = os.path.join(self.raw_path + ".gz") un_zip(file_name) orders_file = os.path.join(self.raw_path, "orders.csv") # order_products__*.csv: order_id,product_id,add_to_cart_order,reordered prior_products = pd.read_csv( order_products_prior_file, usecols=["order_id", "product_id", "add_to_cart_order"], ) train_products = pd.read_csv( order_products_train_file, usecols=["order_id", "product_id", "add_to_cart_order"], ) order_products = pd.concat([prior_products, train_products]) # orders.csv: order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order orders = pd.read_csv( orders_file, usecols=["user_id", "order_id", "order_number", "eval_set"] ) user_products = order_products.merge(orders, how="left", on="order_id") order_addtocart_user = ( user_products.groupby( ["order_id", "add_to_cart_order", "user_id", "product_id", "eval_set"] ) .size() .rename("ratings") .reset_index() ) order_addtocart_user.rename( columns={ "order_id": DEFAULT_ORDER_COL, "user_id": DEFAULT_USER_COL, 
"product_id": DEFAULT_ITEM_COL, "ratings": DEFAULT_RATING_COL, "eval_set": DEFAULT_FLAG_COL, }, inplace=True, ) timestamp_col = {DEFAULT_TIMESTAMP_COL: order_addtocart_user.index} order_addtocart_user = order_addtocart_user.assign(**timestamp_col) print("Start sampling 25% users from the raw data") users = list(order_addtocart_user[DEFAULT_USER_COL].unique()) sampled_users = random.sample(users, int(len(users) * 0.25)) order_addtocart_user = order_addtocart_user[ order_addtocart_user[DEFAULT_USER_COL].isin(sampled_users) ] print("Loading raw data completed") # save processed data into the disk. self.save_dataframe_as_npz( order_addtocart_user, os.path.join(self.processed_path, f"{self.dataset_name}_interaction.npz"), )