# Source code for beta_rec.datasets.movielens

import os

import numpy as np
import pandas as pd

from beta_rec.datasets.dataset_base import DatasetBase
from beta_rec.utils.constants import (
    DEFAULT_ITEM_COL,
    DEFAULT_RATING_COL,
    DEFAULT_TIMESTAMP_COL,
    DEFAULT_USER_COL,
)

# Raw archive locations on the GroupLens file server.
ML_100K_URL = "http://files.grouplens.org/datasets/movielens/ml-100k.zip"
ML_1M_URL = "http://files.grouplens.org/datasets/movielens/ml-1m.zip"
ML_10M_URL = "http://files.grouplens.org/datasets/movielens/ml-10m.zip"
ML_25M_URL = "http://files.grouplens.org/datasets/movielens/ml-25m.zip"

# Links to the already-processed ml-100k splits.
ML_100K_LEAVE_ONE_OUT_URL = "https://1drv.ms/u/s!AjMahLyQeZqugU-siALoN5y9eaCq?e=jsgoOB"
ML_100K_RANDOM_URL = "https://1drv.ms/u/s!AjMahLyQeZqugVD4bv1iR6KgZn63?e=89eToa"
ML_100K_TEMPORAL_URL = "https://1drv.ms/u/s!AjMahLyQeZqugVG_vS_DggoFaySY?e=HpcD9b"

# Links to the already-processed ml-1m splits.
ML_1M_LEAVE_ONE_OUT_URL = "https://1drv.ms/u/s!AjMahLyQeZqugVMZ5TK2sTGBUSr0?e=32CmFJ"
ML_1M_RANDOM_URL = "https://1drv.ms/u/s!AjMahLyQeZqugVW2Bl1A1kORNuTY?e=iEabat"
ML_1M_TEMPORAL_URL = "https://1drv.ms/u/s!AjMahLyQeZqugVf8PRlo82hSnblP?e=VpZa0L"

# Absolute path of the parent of the current working directory.
par_abs_dir = os.path.abspath(os.path.join(os.path.abspath(os.curdir), os.pardir))

# Relative path of the raw ml-1m ratings file.
ml_1m_raw_dir = "datasets/ml-1m/raw/ratings.dat"
# Relative directory holding the ml-1m temporal split.
ml_1m_temporal_dir = "datasets/ml-1m/temporal"
# Directory holding the ml-1m leave-one-out split, anchored at par_abs_dir.
ml_1m_l1o_dir = os.path.join(par_abs_dir, "datasets/ml-1m/leave_one_out")


class Movielens_100k(DatasetBase):
    """Movielens 100k Dataset.

    Converts the raw ``u.data`` rating file into the processed interaction
    file and builds / loads side-information feature vectors for users and
    items.
    """

    def __init__(self, dataset_name="ml_100k", min_u_c=0, min_i_c=3, root_dir=None):
        """Init Movielens_100k Class.

        All arguments are forwarded unchanged to ``DatasetBase`` together with
        the raw archive url and the three processed-split urls of ml-100k.

        Args:
            dataset_name (str): Dataset name; also used as the raw sub-directory
                and as the prefix of the processed interaction file.
            min_u_c (int): Forwarded to ``DatasetBase``.
            min_i_c (int): Forwarded to ``DatasetBase``.
            root_dir (str): Forwarded to ``DatasetBase``.
        """
        super().__init__(
            dataset_name=dataset_name,
            min_u_c=min_u_c,
            min_i_c=min_i_c,
            root_dir=root_dir,
            url=ML_100K_URL,
            processed_leave_one_out_url=ML_100K_LEAVE_ONE_OUT_URL,
            processed_random_split_url=ML_100K_RANDOM_URL,
            processed_temporal_split_url=ML_100K_TEMPORAL_URL,
        )

    def preprocess(self):
        """Preprocess the raw file.

        Preprocess the file downloaded via the url, convert it to a dataframe
        consisting of the user-item interactions and save it in the processed
        directory. The raw file is downloaded first when it is not on disk.
        """
        file_name = os.path.join(self.raw_path, self.dataset_name, "u.data")
        if not os.path.exists(file_name):
            self.download()

        data = pd.read_table(
            file_name,
            header=None,
            sep="\t",
            engine="python",
            names=[
                DEFAULT_USER_COL,
                DEFAULT_ITEM_COL,
                DEFAULT_RATING_COL,
                DEFAULT_TIMESTAMP_COL,
            ],
        )
        self.save_dataframe_as_npz(
            data,
            os.path.join(self.processed_path, f"{self.dataset_name}_interaction.npz"),
        )

    @staticmethod
    def _one_hot(column):
        """Return a one-hot int matrix for ``column``.

        Categories are numbered in order of first appearance, so row ``r`` has
        its single 1 at the position of ``column[r]`` in ``column.unique()``.
        """
        categories = column.unique()
        position = {value: idx for idx, value in enumerate(categories)}
        codes = np.fromiter(
            (position[value] for value in column), dtype=int, count=len(column)
        )
        return np.eye(len(categories), dtype=int)[codes]

    def make_fea_vec(self):
        """Make feature vectors for users and items.

        1. For items (movies), we use the last 19 fields as feature, which are
        the genres, with 1 indicating the movie is of that genre, and 0
        indicating it is not; movies can be in several genres at once.
        2. For users, we construct one_hot encoding for age, gender and
        occupation as their feature, where ages are categorized into 8 groups
        (one per decade; out-of-range ages are clamped to the first/last group).

        The result is also written to ``processed/feature_vec.npz`` under the
        dataset directory.

        Returns:
            user_feat (numpy.ndarray): The first column is the user id, rest column are feat vectors.
            item_feat (numpy.ndarray): The first column is the item id, rest column are feat vectors.
        """
        print(f"Making user and item feature vectors for dataset {self.dataset_name}")
        # NOTE(review): the raw sub-directory is hard-coded to "ml_100k" here,
        # whereas preprocess() uses self.dataset_name — confirm this is intended.
        items = pd.read_table(
            f"{self.dataset_dir}/raw/ml_100k/u.item",
            header=None,
            sep="|",
            engine="python",
            # Movie titles in u.item are not valid UTF-8; latin-1 decodes any
            # byte sequence and only numeric columns are used below.
            encoding="latin-1",
        )
        # first column is the item id, other 19 columns are feature
        item_feat = items[[0] + list(range(5, 24))].to_numpy()

        users = pd.read_table(
            f"{self.dataset_dir}/raw/ml_100k/u.user",
            header=None,
            sep="|",
            engine="python",
        )
        user_ids = users[0].to_numpy().reshape(-1, 1)
        # categorize age into 8 groups: one bucket per decade, clamped to [0, 7]
        age_bucket = np.clip((users[1].to_numpy() // 10).astype(int), 0, 7)
        # builtin int replaces the np.int alias removed from recent NumPy
        age_feat = np.eye(8, dtype=int)[age_bucket]
        gender_feat = self._one_hot(users[2])
        occupation_feat = self._one_hot(users[3])
        user_feat = np.hstack([user_ids, age_feat, gender_feat, occupation_feat])

        np.savez_compressed(
            f"{self.dataset_dir}/processed/feature_vec.npz",
            user_feat=user_feat,
            item_feat=item_feat,
        )
        return user_feat, item_feat

    def load_fea_vec(self):
        """Load feature vectors for users and items.

        Runs ``preprocess`` when the dataset directory is missing and
        ``make_fea_vec`` when the cached ``feature_vec.npz`` is missing, then
        reads the cached arrays back.

        1. For items (movies), we use the last 19 fields as feature, which are
        the genres, with 1 indicating the movie is of that genre, and 0
        indicating it is not; movies can be in several genres at once.
        2. For users, we construct one_hot encoding for age, gender and
        occupation as their feature, where ages are categorized into 8 groups.

        Returns:
            user_feat (numpy.ndarray): The first column is the user id, rest column are feat vectors.
            item_feat (numpy.ndarray): The first column is the item id, rest column are feat vectors.
        """
        feature_file = f"{self.dataset_dir}/processed/feature_vec.npz"
        if not os.path.exists(self.dataset_dir):
            self.preprocess()
        if not os.path.exists(feature_file):
            self.make_fea_vec()
        print(f"Loading user and item feature vectors for dataset {self.dataset_name}")
        loaded = np.load(feature_file)
        return loaded["user_feat"], loaded["item_feat"]


class Movielens_1m(DatasetBase):
    """Movielens 1m Dataset."""

    def __init__(self, dataset_name="ml_1m", min_u_c=0, min_i_c=3, root_dir=None):
        """Init Movielens_1m Class.

        Every argument is handed to ``DatasetBase`` as-is, along with the url
        of the raw ml-1m archive.
        """
        # NOTE(review): the module defines ML_1M_LEAVE_ONE_OUT_URL,
        # ML_1M_RANDOM_URL and ML_1M_TEMPORAL_URL, but they are not forwarded
        # here the way Movielens_100k forwards its processed-split urls —
        # confirm whether that is intentional.
        base_kwargs = {
            "dataset_name": dataset_name,
            "min_u_c": min_u_c,
            "min_i_c": min_i_c,
            "root_dir": root_dir,
            "url": ML_1M_URL,
        }
        super().__init__(**base_kwargs)

    def preprocess(self):
        """Preprocess the raw file.

        Read the ``::``-separated ``ratings.dat`` (downloading it first when it
        is not on disk) into a DataFrame of user-item interactions and save it
        in the processed directory.
        """
        ratings_path = os.path.join(self.raw_path, self.dataset_name, "ratings.dat")
        if not os.path.exists(ratings_path):
            self.download()

        column_names = [
            DEFAULT_USER_COL,
            DEFAULT_ITEM_COL,
            DEFAULT_RATING_COL,
            DEFAULT_TIMESTAMP_COL,
        ]
        interactions = pd.read_table(
            ratings_path,
            sep="::",
            header=None,
            names=column_names,
            engine="python",
        )

        target = os.path.join(
            self.processed_path, f"{self.dataset_name}_interaction.npz"
        )
        self.save_dataframe_as_npz(interactions, target)


class Movielens_25m(DatasetBase):
    """Movielens 25m Dataset."""

    def __init__(self, dataset_name="ml_25m", min_u_c=0, min_i_c=3, root_dir=None):
        """Init Movielens_25m Class.

        All arguments are forwarded unchanged to ``DatasetBase`` together with
        the url of the raw ml-25m archive.
        """
        super().__init__(
            dataset_name=dataset_name,
            min_u_c=min_u_c,
            min_i_c=min_i_c,
            root_dir=root_dir,
            url=ML_25M_URL,
        )

    def preprocess(self):
        """Preprocess the raw file.

        Preprocess the file downloaded via the url, convert it to a DataFrame
        consisting of the user-item interactions and save it in the processed
        directory. The raw file is downloaded first when it is not on disk.
        """
        file_name = os.path.join(self.raw_path, self.dataset_name, "ratings.csv")
        if not os.path.exists(file_name):
            self.download()

        # ratings.csv of ml-25m is comma-separated and starts with a header
        # row (userId,movieId,rating,timestamp). header=0 consumes that row
        # and ``names`` replaces it with the project's column names. A
        # single-character separator also lets pandas use its default parser.
        data = pd.read_csv(
            file_name,
            sep=",",
            header=0,
            names=[
                DEFAULT_USER_COL,
                DEFAULT_ITEM_COL,
                DEFAULT_RATING_COL,
                DEFAULT_TIMESTAMP_COL,
            ],
        )
        self.save_dataframe_as_npz(
            data,
            os.path.join(self.processed_path, f"{self.dataset_name}_interaction.npz"),
        )


class Movielens_10m(DatasetBase):
    """Movielens 10m Dataset."""

    def __init__(self, dataset_name="ml_10m", min_u_c=0, min_i_c=3, root_dir=None):
        """Init Movielens_10m Class.

        All arguments are forwarded unchanged to ``DatasetBase`` together with
        the url of the raw ml-10m archive.
        """
        super().__init__(
            dataset_name=dataset_name,
            min_u_c=min_u_c,
            min_i_c=min_i_c,
            root_dir=root_dir,
            url=ML_10M_URL,
        )

    def preprocess(self):
        """Preprocess the raw file.

        Preprocess the file downloaded via the url, convert it to a DataFrame
        consisting of the user-item interactions and save it in the processed
        directory. The raw file is downloaded first when it is not on disk.
        """
        # The ml-10m archive ships its ratings as ``ratings.dat`` with "::"
        # between fields (same layout as ml-1m); it contains no ratings.csv.
        # NOTE(review): assumes download() places the extracted files under
        # raw_path/dataset_name — confirm against DatasetBase.
        file_name = os.path.join(self.raw_path, self.dataset_name, "ratings.dat")
        if not os.path.exists(file_name):
            self.download()

        data = pd.read_table(
            file_name,
            header=None,
            sep="::",
            engine="python",  # multi-character separators need the python engine
            names=[
                DEFAULT_USER_COL,
                DEFAULT_ITEM_COL,
                DEFAULT_RATING_COL,
                DEFAULT_TIMESTAMP_COL,
            ],
        )
        self.save_dataframe_as_npz(
            data,
            os.path.join(self.processed_path, f"{self.dataset_name}_interaction.npz"),
        )