Source code for beta_rec.datasets.diginetica

import os
import time

import pandas as pd

from ..datasets.dataset_base import DatasetBase
from ..utils.constants import DEFAULT_ITEM_COL, DEFAULT_TIMESTAMP_COL, DEFAULT_USER_COL

# Download URL.
DIGINETICA_URL = "https://cikm2016.cs.iupui.edu/cikm-cup/"

# Tips
DIGINETICA_TIPS = """
    Diginetica dataset can not be downloaded by this url automatically, and you need to do:
    1. Download this dataset via 'https://cikm2016.cs.iupui.edu/cikm-cup/',
    2. Put 'CIKMCUP2016_Track2_DIGINETICA-20200426T024501Z-001.zip' into the directory `diginetica/raw`,
    3. Unzip 'CIKMCUP2016_Track2_DIGINETICA-20200426T024501Z-001.zip',
    4. Rename dir 'CIKMCUP2016_Track2_DIGINETICA' to 'diginetica',
    5. Enter dir 'diginetica' and unzip 'dataset-train-diginetica.zip',
    6. Rename file 'train-item-views.csv' to 'diginetica.csv'
    7. Rerun this program.
"""


[docs]def process_time(standard_time=None):
    """Transform time format "xxxx-xx-xx" into format "xxxx-xx-xx xx-xx-xx".

    If there is no specified hour-minute-second data, we use 00:00:00 as default value.

    Args:
        standard_time: str with format "xxxx-xx-xx".
    Returns:
        timestamp: timestamp data.
    """
    standard_time = standard_time + " 00:00:00"
    date_arr = time.strptime(standard_time, "%Y-%m-%d %H:%M:%S")
    timestamp = int(time.mktime(date_arr))
    return timestamp


[docs]class Diginetica(DatasetBase):
    r"""Diginetica Dataset.

    This is a dataset provided by DIGINETICA and its partners containing anonymized search and browsing logs,
    product data, anonymized transactions, and a large data set of product images. The participants have to
    predict search relevance of products according to the personal shopping, search, and browsing preferences of
    the users. Both 'query-less' and 'query-full' sessions are possible. The evaluation is based on click and
    transaction data.

    The dataset can not be download by the url,
    you need to down the dataset by 'https://cikm2016.cs.iupui.edu/cikm-cup/'
    then put it into the directory `diginetica/raw`,
    then unzip this file and rename the new directory to 'diginetica'.

    Note: you also need unzip files in 'diginetica/raw/diginetica'.
    """

    def __init__(self, dataset_name="diginetica", min_u_c=0, min_i_c=3, root_dir=None):
        """Init Diginetica Class."""
        super().__init__(
            dataset_name=dataset_name,
            min_u_c=min_u_c,
            min_i_c=min_i_c,
            root_dir=root_dir,
            manual_download_url=DIGINETICA_URL,
            tips=DIGINETICA_TIPS,
        )

[docs]    def preprocess(self):
        """Preprocess the raw file.

        Preprocess the file downloaded via the url,
        convert it to a dataframe consist of the user-item interaction
        and save in the processed directory

        Download datasets if not existed.
        diginetica_name: train-item-views.csv

        1. Download diginetica dataset if this dataset is not existed.
        2. Load diginetica <diginetica-item-views> table from 'diginetica.csv'.
        3. Add rating column and create timestamp column.
        4. Save data model.
        """
        # Step 1: Download diginetica dataset if this dataset is not existed.
        diginetica_path = os.path.join(
            self.raw_path, self.dataset_name, "diginetica.csv"
        )
        if not os.path.exists(diginetica_path):
            self.download()

        # Step 2: Load diginetica <diginetica-item-views> table from 'diginetica.csv'.
        prior_transactions = pd.read_csv(
            diginetica_path,
            header=0,
            encoding="utf-8",
            engine="python",
            sep=";",
            usecols=[0, 2, 4],
            names=[DEFAULT_USER_COL, DEFAULT_ITEM_COL, DEFAULT_TIMESTAMP_COL],
        )

        # Step 3: Add rating column and create timestamp column.
        # Add rating column into this table.
        prior_transactions.insert(2, "col_rating", 1.0)

        # Create timestamp column.
        prior_transactions[DEFAULT_TIMESTAMP_COL] = prior_transactions[
            DEFAULT_TIMESTAMP_COL
        ].apply(lambda t: process_time(t))

        # Check the validation of this dataset.
        print(prior_transactions.head())

        # Step 4: Save data model.
        self.save_dataframe_as_npz(
            prior_transactions,
            os.path.join(self.processed_path, f"{self.dataset_name}_interaction.npz"),
        )

        print("Done.")