# Source code for beta_rec.datasets.diginetica

import os
import time

import pandas as pd

from ..datasets.dataset_base import DatasetBase

# Download URL.
# NOTE(review): the URL literal is missing from this copy of the file (the
# quoted placeholders in the tips below are empty as well). The empty string
# only keeps the name defined; restore the real download address.
DIGINETICA_URL = ""

# Tips
# Manual-download instructions handed to the dataset base class.
# NOTE(review): the '' placeholders below lost their original URL / archive
# file names; the text is kept verbatim until they can be restored.
DIGINETICA_TIPS = """
    Diginetica dataset can not be downloaded by this url automatically, and you need to do:
    1. Download this dataset via '',
    2. Put '' into the directory `diginetica/raw`,
    3. Unzip '',
    4. Rename dir 'CIKMCUP2016_Track2_DIGINETICA' to 'diginetica',
    5. Enter dir 'diginetica' and unzip '',
    6. Rename file 'train-item-views.csv' to 'diginetica.csv'
    7. Rerun this program.
"""

def process_time(standard_time=None):
    """Convert a date string "YYYY-MM-DD" into an integer Unix timestamp.

    The date carries no hour-minute-second part, so midnight ("00:00:00")
    is appended before parsing.

    Args:
        standard_time: str with format "xxxx-xx-xx" (year-month-day).

    Returns:
        timestamp: int, seconds since the epoch for midnight of that day.
            ``time.mktime`` interprets the parsed time as local time, so the
            value depends on the timezone of the running process.

    Raises:
        TypeError: if ``standard_time`` is not a string (including the
            default ``None``).
        ValueError: if the string does not match the "%Y-%m-%d" layout.
    """
    # Fail with an explicit message instead of the opaque "NoneType + str"
    # error the concatenation below would otherwise produce.
    if not isinstance(standard_time, str):
        raise TypeError(
            "standard_time must be a str formatted as 'YYYY-MM-DD', "
            f"got {type(standard_time).__name__}"
        )
    date_arr = time.strptime(standard_time + " 00:00:00", "%Y-%m-%d %H:%M:%S")
    return int(time.mktime(date_arr))
class Diginetica(DatasetBase):
    r"""Diginetica Dataset.

    This is a dataset provided by DIGINETICA and its partners containing
    anonymized search and browsing logs, product data, anonymized
    transactions, and a large data set of product images. The participants
    have to predict search relevance of products according to the personal
    shopping, search, and browsing preferences of the users. Both
    'query-less' and 'query-full' sessions are possible. The evaluation is
    based on click and transaction data.

    The dataset cannot be downloaded automatically: download it manually,
    put the archive into the directory `diginetica/raw`, unzip it and rename
    the new directory to 'diginetica'.

    Note: you also need to unzip the files in 'diginetica/raw/diginetica'.
    """

    def __init__(self, dataset_name="diginetica", min_u_c=0, min_i_c=3, root_dir=None):
        """Init Diginetica Class.

        All arguments are forwarded unchanged to ``DatasetBase.__init__``,
        together with the module-level manual download URL and tips.

        Args:
            dataset_name: name of the dataset; also used as the sub-directory
                of the raw path and as the prefix of the processed file.
            min_u_c: forwarded to the base class (presumably a minimum
                per-user interaction count -- confirm against DatasetBase).
            min_i_c: forwarded to the base class (presumably a minimum
                per-item interaction count -- confirm against DatasetBase).
            root_dir: forwarded to the base class.
        """
        super().__init__(
            dataset_name=dataset_name,
            min_u_c=min_u_c,
            min_i_c=min_i_c,
            root_dir=root_dir,
            manual_download_url=DIGINETICA_URL,
            tips=DIGINETICA_TIPS,
        )

    def preprocess(self):
        """Preprocess the raw file.

        Convert the manually downloaded 'diginetica.csv' (originally
        'train-item-views.csv') into a user-item interaction dataframe and
        save it in the processed directory.

        Steps:
            1. Download diginetica dataset if this dataset is not existed.
            2. Load diginetica <diginetica-item-views> table from
               'diginetica.csv'.
            3. Add rating column and create timestamp column.
            4. Save data model.
        """
        # Step 1: Download diginetica dataset if this dataset is not existed.
        diginetica_path = os.path.join(
            self.raw_path, self.dataset_name, "diginetica.csv"
        )
        if not os.path.exists(diginetica_path):
            # NOTE(review): download() is expected to be inherited from
            # DatasetBase (which receives the manual URL and tips) -- confirm.
            self.download()

        # Step 2: Load diginetica <diginetica-item-views> table from 'diginetica.csv'.
        # The file is ';'-separated; header=0 together with `names` replaces
        # the header row, and only columns 0, 2 and 4 are kept.
        prior_transactions = pd.read_csv(
            diginetica_path,
            header=0,
            encoding="utf-8",
            engine="python",
            sep=";",
            usecols=[0, 2, 4],
            names=[DEFAULT_USER_COL, DEFAULT_ITEM_COL, DEFAULT_TIMESTAMP_COL],
        )

        # Step 3: Add rating column and create timestamp column.
        # Every row gets a constant rating of 1.0, inserted as the third column.
        prior_transactions.insert(2, "col_rating", 1.0)
        # Turn the date strings into integer timestamps.
        prior_transactions[DEFAULT_TIMESTAMP_COL] = prior_transactions[
            DEFAULT_TIMESTAMP_COL
        ].apply(process_time)

        # Check the validation of this dataset.
        print(prior_transactions.head())

        # Step 4: Save data model.
        self.save_dataframe_as_npz(
            prior_transactions,
            os.path.join(self.processed_path, f"{self.dataset_name}_interaction.npz"),
        )

        print("Done.")