Source code for beta_rec.datasets.tafeng

import os

import numpy as np
import pandas as pd

from ..datasets.dataset_base import DatasetBase
from ..utils.constants import (
    DEFAULT_FLAG_COL,
    DEFAULT_ITEM_COL,
    DEFAULT_ORDER_COL,
    DEFAULT_RATING_COL,
    DEFAULT_TIMESTAMP_COL,
    DEFAULT_USER_COL,
)

# download_url
TAFENG_URL = r"https://1drv.ms/u/s!AjMahLyQeZqugjc2k3eCAwKavccB?e=Qn5ppw"

# processed data url
TAFENG_LEAVE_ONE_OUT_URL = r"https://1drv.ms/u/s!AjMahLyQeZqugWw1iWQHgI2NNbuM?e=LwEbEc"
TAFENG_RANDOM_SPLIT_URL = r"https://1drv.ms/u/s!AjMahLyQeZqugWbXQ__YWqF9v_7x?e=NjX5VQ"
TAFENG_TEMPORAL_SPLIT_URL = r"https://1drv.ms/u/s!AjMahLyQeZqugWp1Y1JefMXZr0ng?e=OoAgwD"


[docs]class Tafeng(DatasetBase): """Tafeng Dataset. The dataset can not be download by the url, you need to down the dataset by 'https://1drv.ms/u/s!AjMahLyQeZqugjc2k3eCAwKavccB?e=Qn5ppw' then put it into the directory `tafeng/raw`. """ def __init__( self, dataset_name="tafeng", min_u_c=0, min_i_c=3, min_o_c=0, root_dir=None ): """Init Tafeng Class.""" super().__init__( dataset_name=dataset_name, min_u_c=min_u_c, min_i_c=min_i_c, min_o_c=min_o_c, root_dir=root_dir, url=TAFENG_URL, manual_download_url=TAFENG_URL, processed_random_split_url=TAFENG_RANDOM_SPLIT_URL, )
[docs] def preprocess(self): """Preprocess the raw file. Preprocess the file downloaded via the url, convert it to a dataframe consist of the user-item interaction and save in the processed directory. """ file_name = os.path.join(self.raw_path, "train.txt") if not os.path.exists(file_name): self.download() original_train_file = os.path.join(self.raw_path, "train.txt") original_test_file = os.path.join(self.raw_path, "test.txt") # initial dataframe interaction_list = [] with open(original_train_file) as ori_test_df: for line in ori_test_df: temp_list = line.replace("\n", "\t").split("\t") # replace '\n' in the end of the line by '\t' # split line by '\t' # store split items in a list order_id = temp_list[0] item_ids_list = temp_list[1:-3] # item_ids time_order = temp_list[-2].replace("-", "") user_id = temp_list[-3] for item_id in item_ids_list: interaction_list.append( [order_id, user_id, item_id, "train", "1", time_order] ) print(len(interaction_list)) with open(original_test_file) as ori_test_df: for line in ori_test_df: temp_list = line.replace("\n", "\t").split("\t") # replace '\n' in the end of the line by '\t' # split line by '\t' # store split items in a list order_id = temp_list[0] item_ids_list = temp_list[1:-3] # item_ids time_order = temp_list[-2].replace("-", "") user_id = temp_list[-3] for item_id in item_ids_list: interaction_list.append( [order_id, user_id, item_id, "train", "1", time_order] ) print(len(interaction_list)) interactions = np.array(interaction_list) full_data = pd.DataFrame( data={ DEFAULT_ORDER_COL: interactions[:, 0], DEFAULT_USER_COL: interactions[:, 1], DEFAULT_ITEM_COL: interactions[:, 2], DEFAULT_FLAG_COL: interactions[:, 3], DEFAULT_RATING_COL: interactions[:, 4], DEFAULT_TIMESTAMP_COL: interactions[:, 5], } ) self.save_dataframe_as_npz( full_data, os.path.join(self.processed_path, f"{self.dataset_name}_interaction.npz"), )