# Source code for beta_rec.datasets.retailrocket

import os

import pandas as pd

from ..datasets.dataset_base import DatasetBase
from ..utils.constants import DEFAULT_ITEM_COL, DEFAULT_TIMESTAMP_COL, DEFAULT_USER_COL

# Kaggle download page of the RetailRocket e-commerce dataset.
# RetailRocket passes this to its base class as `manual_download_url`.
RETAIL_ROCKET_URL = "https://www.kaggle.com/retailrocket/ecommerce-dataset/download"

# Step-by-step manual download instructions for the user.
# RetailRocket passes this to its base class as `tips`.
# NOTE(review): presumably shown when the raw files are missing and
# `download()` is invoked -- confirm against DatasetBase.
RETAIL_ROCKET_TIPS = """
    RetailRocket dataset can not be downloaded by this url automatically, and you need to do:
    1. Download this dataset via 'https://www.kaggle.com/retailrocket/ecommerce-dataset/download',
    2. Put 'ecommerce-dataset.zip' into the directory `retailrocket/raw`,
    3. Unzip 'ecommerce-dataset.zip',
    4. Rerun this program.
"""


class RetailRocket(DatasetBase):
    """RetailRocket Dataset.

    This data has been collected from a real-world e-commerce website. It is
    raw data without any content transformations; however, all values are
    hashed for confidentiality reasons. It was published to motivate research
    in the field of recommendation systems with implicit feedback.

    If the dataset cannot be downloaded via the url, download it manually
    from https://www.kaggle.com/retailrocket/ecommerce-dataset/download,
    then put it into the directory `retailrocket/raw` and unzip it.
    """

    def __init__(
        self, dataset_name="retailrocket", min_u_c=0, min_i_c=3, root_dir=None
    ):
        """Init RetailRocket Class.

        Every argument is forwarded unchanged to the base-class initialiser,
        together with the module-level manual-download URL and tips text.

        Args:
            dataset_name: Name of the dataset; also used by ``preprocess`` to
                build the output file name.
            min_u_c: Forwarded as-is to the base class.
            min_i_c: Forwarded as-is to the base class.
            root_dir: Forwarded as-is to the base class.
        """
        super().__init__(
            manual_download_url=RETAIL_ROCKET_URL,
            tips=RETAIL_ROCKET_TIPS,
            dataset_name=dataset_name,
            min_u_c=min_u_c,
            min_i_c=min_i_c,
            root_dir=root_dir,
        )

    def preprocess(self):
        """Preprocess the raw file.

        Convert the raw 'events.csv' file into a DataFrame of user-item
        interactions and save it in the processed directory.

        1. Call ``self.download()`` if 'events.csv' is not present in the raw
           directory.
        2. Load columns 0, 1 and 3 of 'events.csv' as the timestamp, user and
           item columns, and add a constant rating column of 1.0.
        3. Print a preview of the table and save it as
           '<dataset_name>_interaction.npz'.
        """
        # Step 1: make sure the raw events file is available.
        events_csv = os.path.join(self.raw_path, "events.csv")
        raw_file_missing = not os.path.exists(events_csv)
        if raw_file_missing:
            self.download()

        # Step 2: read only the three columns of interest; `header=0` makes
        # pandas discard the file's own header row in favour of `names`.
        column_names = [DEFAULT_TIMESTAMP_COL, DEFAULT_USER_COL, DEFAULT_ITEM_COL]
        interactions = pd.read_csv(
            events_csv,
            names=column_names,
            usecols=[0, 1, 3],
            header=0,
            encoding="utf-8",
            engine="python",
        )

        # Every event counts as one implicit interaction with rating 1.0;
        # the rating column is placed at position 2 of the table.
        interactions.insert(2, "col_rating", 1.0)

        # Step 3: show a short preview, then persist the interaction table.
        print(interactions.head())
        output_file = os.path.join(
            self.processed_path, f"{self.dataset_name}_interaction.npz"
        )
        self.save_dataframe_as_npz(interactions, output_file)
        print("Done.")