Source code for beta_rec.data.deprecated_data_base

import json
import os
import random
from copy import deepcopy

import numpy as np
import pandas as pd
import scipy.sparse as sp
import torch
from scipy.sparse.linalg import eigsh
from torch.utils.data import DataLoader, Dataset

from ..utils.common_util import ensureDir, normalized_adj_single
from ..utils.constants import (
    DEFAULT_ITEM_COL,
    DEFAULT_RATING_COL,
    DEFAULT_TIMESTAMP_COL,
    DEFAULT_USER_COL,
)


class UserItemRatingDataset(Dataset):
    """Wrapper that converts <user, item, rating> tensors into a PyTorch Dataset."""

    def __init__(self, user_tensor, item_tensor, target_tensor):
        """Init UserItemRatingDataset Class.

        Args:
            user_tensor: torch.Tensor, user indices of the <user, item> pairs.
            item_tensor: torch.Tensor, item indices of the <user, item> pairs.
            target_tensor: torch.Tensor, the corresponding rating for each <user, item> pair.
        """
        self.user_tensor = user_tensor
        self.item_tensor = item_tensor
        self.target_tensor = target_tensor

    def __getitem__(self, index):
        """Get an item from the dataset."""
        return (
            self.user_tensor[index],
            self.item_tensor[index],
            self.target_tensor[index],
        )

    def __len__(self):
        """Get the size of the dataset."""
        return self.user_tensor.size(0)
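# Illustrative usage sketch (not part of the original module; the tensors below are
# hypothetical toy data):
#
#     users = torch.LongTensor([0, 0, 1, 2])
#     items = torch.LongTensor([5, 9, 3, 7])
#     ratings = torch.FloatTensor([1.0, 0.0, 1.0, 1.0])
#     dataset = UserItemRatingDataset(users, items, ratings)
#     loader = DataLoader(dataset, batch_size=2, shuffle=True)
#     for batch_users, batch_items, batch_ratings in loader:
#         pass  # feed each pointwise batch to a model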
class RatingNegativeDataset(Dataset):
    """RatingNegativeDataset.

    Wrapper that converts <user, item, rating> tensors into a PyTorch Dataset
    which also contains negative items, whose ratings are 0.0.
    """

    def __init__(self, user_tensor, item_tensor, rating_tensor):
        """Init RatingNegativeDataset Class.

        Args:
            user_tensor: torch.Tensor, user indices of the <user, item> pairs.
            item_tensor: torch.Tensor, item indices of the <user, item> pairs.
            rating_tensor: torch.Tensor, the corresponding rating for each <user, item> pair.
        """
        self.user_tensor = user_tensor
        self.item_tensor = item_tensor
        self.rating_tensor = rating_tensor

    def __getitem__(self, index):
        """Get an item from the dataset.

        Args:
            index: index of the record to fetch.

        Returns:
            A (user, item, rating) triple; negative samples carry a rating of 0.0.
        """
        return (
            self.user_tensor[index],
            self.item_tensor[index],
            self.rating_tensor[index],
        )
        # Alternative (unused) that splits an index batch into positive and negative parts:
        # index = torch.LongTensor(index, device=self.user_tensor.device)
        # pos_index = index[self.rating_tensor[index] > 0]
        # neg_index = index[self.rating_tensor[index] >= 0]
        # return (
        #     self.user_tensor[pos_index],
        #     self.item_tensor[pos_index],
        #     self.rating_tensor[pos_index],
        #     self.user_tensor[neg_index],
        #     self.item_tensor[neg_index],
        #     self.rating_tensor[neg_index],
        # )

    def __len__(self):
        """Get the size of the dataset."""
        return self.user_tensor.size(0)
class PairwiseNegativeDataset(Dataset):
    """Wrapper that converts <user, pos_item, neg_item> tensors into a PyTorch Dataset."""

    def __init__(self, user_tensor, pos_item_tensor, neg_item_tensor):
        """Init PairwiseNegativeDataset Class.

        Args:
            user_tensor: torch.Tensor, user indices.
            pos_item_tensor: torch.Tensor, positive (interacted) item indices.
            neg_item_tensor: torch.Tensor, sampled negative item indices.
        """
        self.user_tensor = user_tensor
        self.pos_item_tensor = pos_item_tensor
        self.neg_item_tensor = neg_item_tensor

    def __getitem__(self, index):
        """Get an item from the dataset."""
        return (
            self.user_tensor[index],
            self.pos_item_tensor[index],
            self.neg_item_tensor[index],
        )

    def __len__(self):
        """Get the size of the dataset."""
        return self.user_tensor.size(0)
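# Illustrative sketch (not part of the original module): each record is a
# (user, pos_item, neg_item) triple, the input expected by pairwise ranking
# losses such as BPR. `model` below is a hypothetical scorer returning one
# score per (user, item) pair:
#
#     for users, pos_items, neg_items in loader:
#         pos_scores = model(users, pos_items)
#         neg_scores = model(users, neg_items)
#         loss = -torch.log(torch.sigmoid(pos_scores - neg_scores)).mean()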
class DataLoaderBase(object):
    """Construct dataset for NCF."""

    def __init__(self, ratings):
        """Init DataLoaderBase Class.

        Args:
            ratings: pd.DataFrame containing the 4 columns DEFAULT_USER_COL,
                DEFAULT_ITEM_COL, DEFAULT_RATING_COL and DEFAULT_TIMESTAMP_COL.
        """
        assert DEFAULT_USER_COL in ratings.columns
        assert DEFAULT_ITEM_COL in ratings.columns
        assert DEFAULT_RATING_COL in ratings.columns
        assert DEFAULT_TIMESTAMP_COL in ratings.columns

        self.ratings = ratings
        # explicit feedback uses _normalize, implicit feedback uses _binarize
        # self.preprocess_ratings = self._normalize(ratings)
        self.preprocess_ratings = self._binarize(ratings)
        self.user_pool = set(self.ratings[DEFAULT_USER_COL].unique())
        self.item_pool = set(self.ratings[DEFAULT_ITEM_COL].unique())
        self.n_users = len(self.user_pool)
        self.n_items = len(self.item_pool)
        # create negative item samples for NCF learning
        self.negatives = self._sample_negative(ratings)

    def _normalize(self, ratings):
        """Normalize ratings from [0, max_rating] into [0, 1], explicit feedback."""
        ratings = deepcopy(ratings)
        max_rating = ratings[DEFAULT_RATING_COL].max()
        ratings[DEFAULT_RATING_COL] = ratings[DEFAULT_RATING_COL] * 1.0 / max_rating
        return ratings

    def _binarize(self, ratings):
        """Binarize ratings into 0 or 1, implicit feedback."""
        ratings = deepcopy(ratings)
        ratings.loc[ratings[DEFAULT_RATING_COL] > 0, DEFAULT_RATING_COL] = 1.0
        return ratings

    def _sample_negative(self, ratings):
        """Return all negative items & 99 sampled negative items per user."""
        interact_status = (
            ratings.groupby(DEFAULT_USER_COL)[DEFAULT_ITEM_COL]
            .apply(set)
            .reset_index()
            .rename(columns={DEFAULT_ITEM_COL: "interacted_items"})
        )
        interact_status["negative_items"] = interact_status["interacted_items"].apply(
            lambda x: self.item_pool - x
        )
        # random.sample requires a sequence (not a set) on Python >= 3.11
        interact_status["negative_samples"] = interact_status["negative_items"].apply(
            lambda x: random.sample(list(x), 99)
        )
        return interact_status[[DEFAULT_USER_COL, "negative_items", "negative_samples"]]
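    # Illustrative construction sketch (not part of the original module). The column
    # names come from beta_rec.utils.constants, and the user_ids/item_ids/rating_values/
    # timestamps arrays are hypothetical. Note that _sample_negative draws 99 negatives
    # per user, so the item pool must exceed each user's interacted items by at least 99:
    #
    #     ratings = pd.DataFrame(
    #         {
    #             DEFAULT_USER_COL: user_ids,
    #             DEFAULT_ITEM_COL: item_ids,
    #             DEFAULT_RATING_COL: rating_values,
    #             DEFAULT_TIMESTAMP_COL: timestamps,
    #         }
    #     )
    #     data = DataLoaderBase(ratings)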
    def instance_a_train_loader(self, num_negatives, batch_size):
        """Instantiate a train DataLoader for one training epoch."""
        users, items, ratings = [], [], []
        train_ratings = pd.merge(
            self.ratings,
            self.negatives[[DEFAULT_USER_COL, "negative_items"]],
            on=DEFAULT_USER_COL,
        )
        train_ratings["negatives"] = train_ratings["negative_items"].apply(
            lambda x: random.sample(list(x), num_negatives)
        )
        for _, row in train_ratings.iterrows():
            users.append(int(row[DEFAULT_USER_COL]))
            items.append(int(row[DEFAULT_ITEM_COL]))
            ratings.append(float(row[DEFAULT_RATING_COL]))
            for i in range(num_negatives):
                users.append(int(row[DEFAULT_USER_COL]))
                items.append(int(row.negatives[i]))
                ratings.append(float(0))  # negative samples get a 0 rating
        dataset = UserItemRatingDataset(
            user_tensor=torch.LongTensor(users),
            item_tensor=torch.LongTensor(items),
            target_tensor=torch.FloatTensor(ratings),
        )
        return DataLoader(dataset, batch_size=batch_size, shuffle=True)
    def uniform_negative_train_loader(self, num_negatives, batch_size, device):
        """Instantiate a DataLoader for training.

        Sample `num_negatives` negative items for each positive user-item
        interaction, and shuffle them with the positive items. A batch of data
        from this DataLoader is suitable for a binary cross-entropy loss.

        # todo: implement item popularity-biased sampling
        """
        users, items, ratings = [], [], []
        train_ratings = pd.merge(
            self.ratings,
            self.negatives[[DEFAULT_USER_COL, "negative_items"]],
            on=DEFAULT_USER_COL,
        )
        train_ratings["negatives"] = train_ratings["negative_items"].apply(
            lambda x: random.sample(list(x), num_negatives)
        )
        for _, row in train_ratings.iterrows():
            users.append(int(row[DEFAULT_USER_COL]))
            items.append(int(row[DEFAULT_ITEM_COL]))
            ratings.append(float(row[DEFAULT_RATING_COL]))
            for i in range(num_negatives):
                users.append(int(row[DEFAULT_USER_COL]))
                items.append(int(row.negatives[i]))
                ratings.append(float(0))  # negative samples get a 0 rating
        dataset = RatingNegativeDataset(
            user_tensor=torch.LongTensor(users).to(device),
            item_tensor=torch.LongTensor(items).to(device),
            rating_tensor=torch.FloatTensor(ratings).to(device),
        )
        return DataLoader(dataset, batch_size=batch_size, shuffle=True)
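    # Illustrative training-loop sketch (not part of the original module); `model` is a
    # hypothetical scorer that outputs one logit per (user, item) pair:
    #
    #     loader = data.uniform_negative_train_loader(num_negatives=4, batch_size=256, device="cpu")
    #     bce = torch.nn.BCEWithLogitsLoss()
    #     for users, items, ratings in loader:
    #         loss = bce(model(users, items), ratings)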
    def pairwise_negative_train_loader(self, batch_size, device):
        """Instantiate a pairwise DataLoader for training.

        Sample ONE negative item for each user-item pair, and shuffle them with
        the positive items. A batch of data from this DataLoader is suitable for
        a pairwise ranking loss such as BPR.

        # todo: implement item popularity-biased sampling
        """
        users, pos_items, neg_items = [], [], []
        train_ratings = pd.merge(
            self.ratings,
            self.negatives[[DEFAULT_USER_COL, "negative_items"]],
            on=DEFAULT_USER_COL,
        )
        train_ratings["one_negative"] = train_ratings["negative_items"].apply(
            lambda x: random.sample(list(x), 1)
        )
        for _, row in train_ratings.iterrows():
            users.append(int(row[DEFAULT_USER_COL]))
            pos_items.append(int(row[DEFAULT_ITEM_COL]))
            neg_items.append(int(row.one_negative[0]))
        dataset = PairwiseNegativeDataset(
            user_tensor=torch.LongTensor(users).to(device),
            pos_item_tensor=torch.LongTensor(pos_items).to(device),
            neg_item_tensor=torch.LongTensor(neg_items).to(device),
        )
        return DataLoader(dataset, batch_size=batch_size, shuffle=True)
    @property
    def evaluate_data(self):
        """Create evaluation data."""
        test_ratings = pd.merge(
            self.test_ratings,
            self.negatives[[DEFAULT_USER_COL, "negative_samples"]],
            on=DEFAULT_USER_COL,
        )
        test_users, test_items, ratings = [], [], []
        for _, row in test_ratings.iterrows():
            test_users.append(int(row[DEFAULT_USER_COL]))
            test_items.append(int(row[DEFAULT_ITEM_COL]))
            ratings.append(1)
            for i in range(len(row.negative_samples)):
                test_users.append(int(row[DEFAULT_USER_COL]))
                test_items.append(int(row.negative_samples[i]))
                ratings.append(0)
        test_df = pd.DataFrame(
            {
                DEFAULT_USER_COL: test_users,
                DEFAULT_ITEM_COL: test_items,
                DEFAULT_RATING_COL: ratings,
            }
        )
        return test_df
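    # Note (not part of the original module): for every (user, item) pair in
    # self.test_ratings, the frame above contains one row with rating 1 followed by
    # 99 rows holding that user's sampled negative items with rating 0, i.e. a
    # rank-the-positive-against-99-negatives evaluation protocol as used by
    # NCF-style models.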
    def get_adj_mat(self, config):
        """Get the adjacency matrices, creating and caching them if not previously stored.

        This method is for the NGCF model.

        Returns:
            Different types of adjacency matrices.
        """
        process_file_name = (
            "ngcf_"
            + config["dataset"]["dataset"]
            + "_"
            + config["dataset"]["data_split"]
            + (
                ("_" + str(config["dataset"]["percent"] * 100))
                if "percent" in config
                else ""
            )
        )
        process_path = os.path.join(
            config["system"]["process_dir"],
            config["dataset"]["dataset"] + "/",
        )
        process_file_name = os.path.join(process_path, process_file_name)
        ensureDir(process_file_name)
        print(process_file_name)
        try:
            adj_mat = sp.load_npz(os.path.join(process_file_name, "s_adj_mat.npz"))
            norm_adj_mat = sp.load_npz(
                os.path.join(process_file_name, "s_norm_adj_mat.npz")
            )
            mean_adj_mat = sp.load_npz(
                os.path.join(process_file_name, "s_mean_adj_mat.npz")
            )
            print("already load adj matrix", adj_mat.shape)
        except Exception:
            adj_mat, norm_adj_mat, mean_adj_mat = self.create_adj_mat()
            sp.save_npz(os.path.join(process_file_name, "s_adj_mat.npz"), adj_mat)
            sp.save_npz(
                os.path.join(process_file_name, "s_norm_adj_mat.npz"), norm_adj_mat
            )
            sp.save_npz(
                os.path.join(process_file_name, "s_mean_adj_mat.npz"), mean_adj_mat
            )
        return adj_mat, norm_adj_mat, mean_adj_mat
    def create_adj_mat(self):
        """Create the adjacency matrix from the user-item interaction matrix."""
        adj_mat = sp.dok_matrix(
            (self.n_users + self.n_items, self.n_users + self.n_items), dtype=np.float32
        )
        adj_mat = adj_mat.tolil()
        R = sp.dok_matrix((self.n_users, self.n_items), dtype=np.float32)
        user_np = np.array(self.ratings[DEFAULT_USER_COL])
        item_np = np.array(self.ratings[DEFAULT_ITEM_COL])
        for u in range(self.n_users):
            index = list(np.where(user_np == u)[0])
            i = item_np[index]
            for item in i:
                R[u, item] = 1
        R = R.tolil()
        adj_mat[: self.n_users, self.n_users :] = R
        adj_mat[self.n_users :, : self.n_users] = R.T
        adj_mat = adj_mat.todok()
        print("already create adjacency matrix", adj_mat.shape)
        norm_adj_mat = normalized_adj_single(adj_mat + sp.eye(adj_mat.shape[0]))
        mean_adj_mat = normalized_adj_single(adj_mat)
        print("already normalize adjacency matrix")
        return adj_mat.tocsr(), norm_adj_mat.tocsr(), mean_adj_mat.tocsr()
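    # Structure of the matrix built above (comment sketch, not part of the original
    # module): with R the n_users x n_items interaction matrix,
    #
    #     adj_mat = [[ 0,   R ],
    #                [ R.T, 0 ]]
    #
    # norm_adj_mat is the normalized version of (adj_mat + I) and mean_adj_mat the
    # normalized adj_mat, both produced by normalized_adj_single.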
    def get_graph_embeddings(self, config):
        """Get the graph embeddings, creating and caching them if not previously stored.

        This method is for the LCFN model.

        Returns:
            The truncated eigenvectors (graph embeddings) of the user and item graph matrices.
        """
        process_file_name = (
            "lcfn_"
            + config["dataset"]["dataset"]
            + "_"
            + config["dataset"]["data_split"]
            + (
                ("_" + str(config["dataset"]["percent"] * 100))
                if "percent" in config
                else ""
            )
        )
        process_path = os.path.join(
            config["system"]["process_dir"],
            config["dataset"]["dataset"] + "/",
        )
        process_file_name = os.path.join(process_path, process_file_name)
        ensureDir(process_file_name)
        print(process_file_name)
        try:
            with open(process_file_name + "/graph_embeddings.json") as f:
                line = f.readline()
                graph_embeddings = json.loads(line)
            print("already load graph embeddings")
        except Exception:
            graph_embeddings = self.create_graph_embeddings(config)
            with open(process_file_name + "/graph_embeddings.json", "w") as f:
                f.write(json.dumps(graph_embeddings))
        cut_off = config["model"]["cut_off"]
        [graph_u, graph_i] = graph_embeddings
        graph_u = np.array(graph_u)[:, 0 : int(cut_off * self.n_users)].astype(
            np.float32
        )
        graph_i = np.array(graph_i)[:, 0 : int(cut_off * self.n_items)].astype(
            np.float32
        )
        return [graph_u, graph_i]
    def create_graph_embeddings(self, config):
        """Create graph embeddings from the user and item hypergraphs."""
        cut_off = config["model"]["cut_off"]
        user_np = np.array(self.ratings[DEFAULT_USER_COL])
        item_np = np.array(self.ratings[DEFAULT_ITEM_COL])
        user_number = self.n_users
        item_number = self.n_items
        tolerant = 0.1 ** 5
        epsilon = 0.1 ** 10
        # hypergraph incidence matrices and vertex/hyperedge degree matrices
        H_u = sp.lil_matrix((user_number, item_number))
        H_v = sp.lil_matrix((item_number, user_number))
        D_u = sp.lil_matrix((user_number, user_number))
        D_v = sp.lil_matrix((item_number, item_number))
        I_u = sp.lil_matrix(np.eye(user_number, user_number))
        I_v = sp.lil_matrix(np.eye(item_number, item_number))
        for user in range(self.n_users):
            index = list(np.where(user_np == user)[0])
            i = item_np[index]
            for item in i:
                H_u[user, item] = 1
                H_v[item, user] = 1
                D_u[user, user] += 1
                D_v[item, item] += 1
        print(" constructing user matrix...")
        D_n = sp.lil_matrix((user_number, user_number))
        D_e = sp.lil_matrix((item_number, item_number))
        for i in range(user_number):
            D_n[i, i] = 1.0 / max(np.sqrt(D_u[i, i]), epsilon)
        for i in range(item_number):
            D_e[i, i] = 1.0 / max(D_v[i, i], epsilon)
        L_u = I_u - D_n * H_u * D_e * H_u.T * D_n
        print(" constructing item matrix...")
        D_n = sp.lil_matrix((item_number, item_number))
        D_e = sp.lil_matrix((user_number, user_number))
        for i in range(item_number):
            D_n[i, i] = 1.0 / max(np.sqrt(D_v[i, i]), epsilon)
        for i in range(user_number):
            D_e[i, i] = 1.0 / max(D_u[i, i], epsilon)
        L_v = I_v - D_n * H_v * D_e * H_v.T * D_n
        print("Decomposing the laplacian matrices...")
        print(" decomposing user matrix...")
        [Lamda, user_graph_embeddings] = eigsh(
            L_u, k=int(cut_off * self.n_users), which="SM", tol=tolerant
        )
        print(Lamda[0:10])
        print(" decomposing item matrix...")
        [Lamda, item_graph_embeddings] = eigsh(
            L_v, k=int(cut_off * self.n_items), which="SM", tol=tolerant
        )
        print(Lamda[0:10])
        return [user_graph_embeddings.tolist(), item_graph_embeddings.tolist()]
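    # The matrices decomposed above (comment sketch, not part of the original module):
    # with H_u the n_users x n_items incidence matrix of the user hypergraph, D_u the
    # user (vertex) degree matrix and D_v the item (hyperedge) degree matrix,
    #
    #     L_u = I - D_u^(-1/2) * H_u * D_v^(-1) * H_u.T * D_u^(-1/2)
    #
    # and symmetrically for L_v. eigsh keeps the int(cut_off * n) eigenvectors with the
    # smallest-magnitude eigenvalues, which serve as the user/item graph embeddings for LCFN.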