Source code for beta_rec.datasets.hetrec

import csv
import os

import pandas as pd

from ..datasets.dataset_base import DatasetBase
from ..utils.constants import (
    DEFAULT_ITEM_COL,
    DEFAULT_ORDER_COL,
    DEFAULT_RATING_COL,
    DEFAULT_TIMESTAMP_COL,
    DEFAULT_USER_COL,
)

# Download URLs
ML_2K_URL = (
    "http://files.grouplens.org/datasets/hetrec2011/hetrec2011-movielens-2k-v2.zip"
)
DL_2K_URL = "http://files.grouplens.org/datasets/hetrec2011/hetrec2011-delicious-2k.zip"
LF_2K_URL = "http://files.grouplens.org/datasets/hetrec2011/hetrec2011-lastfm-2k.zip"

# Processed data URLs
ML_2K_LEAVE_ONE_OUT_URL = "https://1drv.ms/u/s!AjMahLyQeZqugjLTDesYUMDz7m4-?e=YjAeFC"
ML_2K_RANDOM_URL = "https://1drv.ms/u/s!AjMahLyQeZqugjQGQk6tnU_abBhJ?e=ewH1dM"
ML_2K_TEMPORAL_URL = "https://1drv.ms/u/s!AjMahLyQeZqugja3hXLGo-74UziC?e=6MN7jh"
DL_2K_LEAVE_ONE_BASKET_URL = "https://1drv.ms/u/s!AjMahLyQeZqugiwhklT9W7jC1jCs?e=aO2IZz"
DL_2K_LEAVE_ONE_OUT_URL = "https://1drv.ms/u/s!AjMahLyQeZqugi6ZpsxBQ-6KbXmE?e=wTyIX7"
LF_2K_LEAVE_ONE_BASKET_URL = "https://1drv.ms/u/s!AjMahLyQeZqugh0S50BjwNBDIVSi?e=ZSgjoB"
LF_2K_LEAVE_ONE_OUT_URL = "https://1drv.ms/u/s!AjMahLyQeZqugh9SOy0tpGT-DyGO?e=djcSUS"
LF_2K_RANDOM_URL = "https://1drv.ms/u/s!AjMahLyQeZqugiE-a65YO3G7ziq4?e=XKla4F"
LF_2K_RANDOM_BASKET_URL = "https://1drv.ms/u/s!AjMahLyQeZqugiM00QUQJYPWA5YE?e=s5o7By"
LF_2K_TEMPORAL_URL = "https://1drv.ms/u/s!AjMahLyQeZqugiWDnPcNDr-5Hyri?e=m8fqQa"
LF_2K_TEMPORAL_BASKET_URL = "https://1drv.ms/u/s!AjMahLyQeZqugic_tmXvkpxN_RE8?e=ZKuSbB"


class MovieLens_2k(DatasetBase):
    """MovieLens-2k Dataset.

    If the dataset cannot be downloaded via the url, download it manually from
    'http://files.grouplens.org/datasets/hetrec2011/hetrec2011-movielens-2k-v2.zip'
    and put it into the directory `movielens-2k/raw`.
    """

    def __init__(
        self, dataset_name="movielens-2k", min_u_c=0, min_i_c=3, root_dir=None
    ):
        """Init MovieLens_2k Class."""
        super().__init__(
            dataset_name=dataset_name,
            min_u_c=min_u_c,
            min_i_c=min_i_c,
            root_dir=root_dir,
            manual_download_url=ML_2K_URL,
            url=ML_2K_URL,
            processed_leave_one_out_url=ML_2K_LEAVE_ONE_OUT_URL,
            processed_random_split_url=ML_2K_RANDOM_URL,
            processed_temporal_split_url=ML_2K_TEMPORAL_URL,
        )

    def preprocess(self):
        """Preprocess the raw file.

        Preprocess the file downloaded via the url, convert it to a dataframe
        consisting of the user-item interactions, and save it in the processed
        directory.
        """
        movie_2k_file = os.path.join(self.raw_path, "user_ratedmovies-timestamps.dat")
        if not os.path.exists(movie_2k_file):
            self.download()

        # Load in [user, movie, rating, timestamp] format.
        prior_transactions = pd.read_csv(
            movie_2k_file,
            header=0,
            encoding="utf-8",
            delimiter="\t",
            quoting=csv.QUOTE_NONE,
        )

        # Rename the columns to the standard names.
        prior_transactions.rename(
            columns={
                "userID": DEFAULT_USER_COL,
                "movieID": DEFAULT_ITEM_COL,
                "rating": DEFAULT_RATING_COL,
                "timestamp": DEFAULT_TIMESTAMP_COL,
            },
            inplace=True,
        )

        # Inspect the head of prior_transactions as a sanity check.
        print(prior_transactions.head())

        # Save data.
        self.save_dataframe_as_npz(
            prior_transactions,
            os.path.join(self.processed_path, f"{self.dataset_name}_interaction.npz"),
        )

        print("Done.")
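
# A minimal usage sketch (standalone, not part of the module; "./data" is a
# hypothetical root_dir). preprocess() fetches the raw archive if the .dat
# file is absent, then writes movielens-2k_interaction.npz into the processed
# directory:
#
#     dataset = MovieLens_2k(root_dir="./data")
#     dataset.preprocess()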


class Delicious_2k(DatasetBase):
    """Delicious-2k Dataset.

    This dataset contains social networking, bookmarking, and tagging
    information from a set of 2K users of the Delicious social bookmarking
    system, http://www.delicious.com.

    If the dataset cannot be downloaded via the url, download it manually from
    'http://files.grouplens.org/datasets/hetrec2011/hetrec2011-delicious-2k.zip'
    and put it into the directory `delicious-2k/raw`.
    """

    def __init__(
        self,
        dataset_name="delicious-2k",
        min_u_c=0,
        min_i_c=3,
        root_dir=None,
    ):
        """Init Delicious_2k Class."""
        super().__init__(
            dataset_name=dataset_name,
            min_u_c=min_u_c,
            min_i_c=min_i_c,
            root_dir=root_dir,
            manual_download_url=DL_2K_URL,
            url=DL_2K_URL,
            processed_leave_one_basket_url=DL_2K_LEAVE_ONE_BASKET_URL,
            processed_leave_one_out_url=DL_2K_LEAVE_ONE_OUT_URL,
            processed_random_split_url="",
            processed_temporal_split_url="",
        )

    def preprocess(self):
        """Preprocess the raw file.

        Preprocess the file downloaded via the url, convert it to a dataframe
        consisting of the user-item interactions, and save it in the processed
        directory.
        """
        delicious_file = os.path.join(
            self.raw_path, "user_taggedbookmarks-timestamps.dat"
        )
        if not os.path.exists(delicious_file):
            self.download()

        # Load in [user, bookmark, tag, timestamp] format.
        prior_transactions = pd.read_csv(
            delicious_file,
            header=0,
            encoding="utf-8",
            delimiter="\t",
            quoting=csv.QUOTE_NONE,
        )

        # Tagging events carry no explicit rating; add a constant implicit
        # rating of 1 for every interaction.
        prior_transactions.insert(3, "rating", 1)

        # Rename the columns to the standard names.
        prior_transactions.rename(
            columns={
                "userID": DEFAULT_USER_COL,
                "bookmarkID": DEFAULT_ITEM_COL,
                "tagID": DEFAULT_ORDER_COL,
                "rating": DEFAULT_RATING_COL,
                "timestamp": DEFAULT_TIMESTAMP_COL,
            },
            inplace=True,
        )

        # Inspect the head of prior_transactions as a sanity check.
        # print(prior_transactions.head())

        # Save data.
        self.save_dataframe_as_npz(
            prior_transactions,
            os.path.join(self.processed_path, f"{self.dataset_name}_interaction.npz"),
        )

        print("Done.")
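
# Illustration of the implicit-feedback step above on a toy frame (standalone
# sketch; the values are made up):
#
#     import pandas as pd
#
#     toy = pd.DataFrame(
#         {"userID": [8], "bookmarkID": [1], "tagID": [3],
#          "timestamp": [1289255700000]}
#     )
#     toy.insert(3, "rating", 1)  # every tagging event counts as one interaction
#     # toy now has columns [userID, bookmarkID, tagID, rating, timestamp]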


class LastFM_2k(DatasetBase):
    """Lastfm-2k Dataset.

    This dataset contains social networking, tagging, and music artist
    listening information from a set of 2K users of the Last.fm online music
    system.

    If the dataset cannot be downloaded via the url, download it manually from
    'http://files.grouplens.org/datasets/hetrec2011/hetrec2011-lastfm-2k.zip'
    and put it into the directory `lastfm-2k/raw`.
    """

    def __init__(self, dataset_name="lastfm-2k", min_u_c=0, min_i_c=3, root_dir=None):
        """Init LastFM_2k Class."""
        super().__init__(
            dataset_name=dataset_name,
            min_u_c=min_u_c,
            min_i_c=min_i_c,
            root_dir=root_dir,
            manual_download_url=LF_2K_URL,
            url=LF_2K_URL,
            processed_leave_one_basket_url=LF_2K_LEAVE_ONE_BASKET_URL,
            processed_leave_one_out_url=LF_2K_LEAVE_ONE_OUT_URL,
            processed_random_basket_split_url=LF_2K_RANDOM_BASKET_URL,
            processed_random_split_url=LF_2K_RANDOM_URL,
            processed_temporal_basket_split_url=LF_2K_TEMPORAL_BASKET_URL,
            processed_temporal_split_url=LF_2K_TEMPORAL_URL,
        )

    def preprocess(self):
        """Preprocess the raw file.

        Preprocess the file downloaded via the url, convert it to a dataframe
        consisting of the user-item interactions, and save it in the processed
        directory.
        """
        lastfm_file = os.path.join(self.raw_path, "user_taggedartists-timestamps.dat")
        if not os.path.exists(lastfm_file):
            self.download()

        # Load in [user, artist, tag, timestamp] format.
        prior_transactions = pd.read_csv(
            lastfm_file,
            header=0,
            encoding="utf-8",
            delimiter="\t",
            quoting=csv.QUOTE_NONE,
        )

        # Tagging events carry no explicit rating; add a constant implicit
        # rating of 1 for every interaction.
        prior_transactions.insert(3, "rating", 1)

        # Rename the columns to the standard names.
        prior_transactions.rename(
            columns={
                "userID": DEFAULT_USER_COL,
                "artistID": DEFAULT_ITEM_COL,
                "tagID": DEFAULT_ORDER_COL,
                "rating": DEFAULT_RATING_COL,
                "timestamp": DEFAULT_TIMESTAMP_COL,
            },
            inplace=True,
        )

        # Inspect the head of prior_transactions as a sanity check.
        print(prior_transactions.head())

        # Save data.
        self.save_dataframe_as_npz(
            prior_transactions,
            os.path.join(self.processed_path, f"{self.dataset_name}_interaction.npz"),
        )

        print("Done.")
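
# A minimal usage sketch (standalone; "./data" is a hypothetical path).
# LastFM_2k is the one class here that wires up processed-split URLs, so a
# pre-built split can be fetched as well; load_leave_one_out and its return
# shape below are assumptions based on the split helpers that other beta_rec
# datasets expose through DatasetBase:
#
#     lastfm = LastFM_2k(root_dir="./data")
#     lastfm.preprocess()
#     train, valid, test = lastfm.load_leave_one_out(n_test=10)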