Source code for beta_rec.core.eval_engine

# coding=utf-8

"""This is the core implementation of the evaluation."""
import concurrent.futures
from threading import Lock, Thread

import numpy as np
import pandas as pd
import torch
from tensorboardX import SummaryWriter
from tqdm.autonotebook import tqdm

from ..utils import evaluation as eval_model
from ..utils.common_util import print_dict_as_table, save_to_csv, timeit
from ..utils.constants import (
    DEFAULT_ITEM_COL,
    DEFAULT_PREDICTION_COL,
    DEFAULT_RATING_COL,
    DEFAULT_USER_COL,
)
from ..utils.seq_evaluation import mrr, ndcg, precision, recall

lock_train_eval = Lock()
lock_test_eval = Lock()


def computeRePos(time_seq, time_span):
    """Compute the relative time-interval matrix for a user.

    Args:
        time_seq (numpy.ndarray): 1-D array of interaction timestamps.
        time_span (int): maximum time interval; larger intervals are clipped to this value.

    Returns:
        numpy.ndarray: a (len(time_seq), len(time_seq)) matrix of clipped pairwise time intervals.
    """
    size = time_seq.shape[0]
    time_matrix = np.zeros([size, size], dtype=np.int32)
    for i in range(size):
        for j in range(size):
            span = abs(time_seq[i] - time_seq[j])
            if span > time_span:
                time_matrix[i][j] = time_span
            else:
                time_matrix[i][j] = span
    return time_matrix

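For illustration, a hypothetical call with three timestamps and a span cap of 15 (the values are invented for this example and not taken from the library):

# Hypothetical example: three interactions at times 10, 20 and 50, clipped at 15.
import numpy as np

time_seq = np.array([10, 20, 50])
print(computeRePos(time_seq, time_span=15))
# Expected output:
# [[ 0 10 15]
#  [10  0 15]
#  [15 15  0]]
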
def evaluate(data_df, predictions, metrics, k_li):
    """Evaluate the performance of a prediction by different metrics.

    Args:
        data_df (DataFrame): the dataset to be evaluated.
        predictions (ndarray): 1-D array. The predicted scores for each user-item pair in the dataset.
        metrics (list): metrics to be evaluated.
        k_li (int or list): top k(s) to be evaluated.

    Returns:
        result_dic (dict): Performance result.
    """
    user_ids = data_df[DEFAULT_USER_COL].to_numpy()
    item_ids = data_df[DEFAULT_ITEM_COL].to_numpy()
    pred_df = pd.DataFrame(
        {
            DEFAULT_USER_COL: user_ids,
            DEFAULT_ITEM_COL: item_ids,
            DEFAULT_PREDICTION_COL: predictions,
        }
    )
    metric_mapping = {
        "rmse": eval_model.rmse,
        "mae": eval_model.mae,
        "rsquared": eval_model.rsquared,
        "ndcg": eval_model.ndcg_at_k,
        "map": eval_model.map_at_k,
        "precision": eval_model.precision_at_k,
        "recall": eval_model.recall_at_k,
    }
    result_dic = {}
    if not isinstance(k_li, list):
        k_li = [k_li]
    for k in k_li:
        for metric in metrics:
            result = metric_mapping[metric](data_df, pred_df, k=k)
            result_dic[f"{metric}@{k}"] = result
    return result_dic

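A minimal usage sketch with toy data, assuming the default column names from beta_rec.utils.constants and that the ranking metrics read ground-truth relevance from DEFAULT_RATING_COL (the users, items, and scores below are illustrative only):

# Hypothetical toy interactions: two users, two items each, binary relevance.
import numpy as np
import pandas as pd

data_df = pd.DataFrame(
    {
        DEFAULT_USER_COL: [0, 0, 1, 1],
        DEFAULT_ITEM_COL: [10, 11, 10, 12],
        DEFAULT_RATING_COL: [1, 0, 1, 1],
    }
)
predictions = np.array([0.9, 0.2, 0.4, 0.8])
print(evaluate(data_df, predictions, metrics=["ndcg", "recall"], k_li=[5, 10]))
# -> {"ndcg@5": ..., "recall@5": ..., "ndcg@10": ..., "recall@10": ...}
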
@timeit
def train_eval_worker(testEngine, valid_df, test_df, valid_pred, test_pred, epoch):
    """Start a worker for the evaluation during training.

    Args:
        testEngine (EvalEngine): the evaluation engine that tracks metrics and the best performance so far.
        valid_df (DataFrame): validation dataset.
        test_df (DataFrame): testing dataset.
        valid_pred (ndarray): predictions on the validation dataset.
        test_pred (ndarray): predictions on the testing dataset.
        epoch (int): current epoch id.

    Returns:
        (dict, dict): dictionaries with performances on the validation and testing sets.
    """
    testEngine.n_worker += 1
    valid_result = evaluate(
        valid_df, valid_pred, testEngine.metrics, testEngine.valid_k
    )
    test_result = evaluate(test_df, test_pred, testEngine.metrics, testEngine.valid_k)
    lock_train_eval.acquire()  # lock record and get best performance
    testEngine.record_performance(valid_result, test_result, epoch)
    if (
        valid_result[f"{testEngine.valid_metric}@{testEngine.valid_k}"]
        > testEngine.best_valid_performance
    ):
        testEngine.n_no_update = 0
        print(
            "Current testEngine.best_valid_performance"
            f" {testEngine.best_valid_performance}"
        )
        testEngine.best_valid_performance = valid_result[
            f"{testEngine.valid_metric}@{testEngine.valid_k}"
        ]
        print_dict_as_table(
            valid_result,
            tag=f"performance on validation at epoch {epoch}",
            columns=["metrics", "values"],
        )
        print_dict_as_table(
            test_result,
            tag=f"performance on testing at epoch {epoch}",
            columns=["metrics", "values"],
        )
    else:
        testEngine.n_no_update += 1
        print(f"number of epochs that have no update {testEngine.n_no_update}")
    testEngine.n_worker -= 1
    lock_train_eval.release()
    return valid_result, test_result

@timeit
def test_eval_worker(testEngine, eval_data_df, prediction):
    """Start a worker for the evaluation on the testing set.

    Prediction and evaluation on the testing set.
    """
    config = testEngine.config["system"]
    result_para = {
        "run_time": [testEngine.config["run_time"]],
    }
    testEngine.n_worker += 1
    for cfg in ["model", "dataset"]:
        for col in testEngine.config[cfg]["result_col"]:
            result_para[col] = [testEngine.config[cfg][col]]

    test_result_dic = evaluate(
        eval_data_df, prediction, testEngine.metrics, testEngine.k
    )
    print_dict_as_table(
        test_result_dic,
        tag="performance on test",
        columns=["metrics", "values"],
    )
    test_result_dic.update(result_para)
    lock_test_eval.acquire()  # protect the shared result file
    result_df = pd.DataFrame(test_result_dic)
    save_to_csv(result_df, config["result_file"])
    lock_test_eval.release()
    testEngine.n_worker -= 1

    save_mode = config["save_mode"]
    if save_mode == "per_user":
        rating_pre_df = eval_data_df.drop(columns=["col_timestamp"])
        rating_pre_df["col_prediction"] = prediction
        save_to_csv(
            rating_pre_df,
            config["root_dir"] + "/" + config["result_dir"] + config["model_run_id"],
        )
    return test_result_dic

class EvalEngine(object):
    """The base evaluation engine."""

    def __init__(self, config):
        """Init EvalEngine Class.

        Args:
            config (dict): parameters for the model.
        """
        self.config = config  # model configuration, should be a dict
        self.metrics = config["system"]["metrics"]
        self.k = config["system"]["k"]
        self.valid_metric = config["system"]["valid_metric"]
        self.valid_k = config["system"]["valid_k"]
        self.batch_eval = (
            config["model"]["batch_eval"] if "batch_eval" in config["model"] else False
        )
        self.batch_size = config["model"]["batch_size"]
        self.writer = SummaryWriter(
            log_dir=config["model"]["run_dir"]
        )  # tensorboard writer
        self.writer.add_text(
            "config/system",
            pd.DataFrame(
                config["system"].items(), columns=["parameters", "values"]
            ).to_string(),
            0,
        )
        self.writer.add_text(
            "config/model",
            pd.DataFrame(
                config["model"].items(), columns=["parameters", "values"]
            ).to_string(),
            0,
        )
        self.n_worker = 0
        self.n_no_update = 0
        self.best_valid_performance = 0
        print("Initializing test engine ...")

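A minimal configuration sketch covering only the keys the constructor reads above; the concrete values are illustrative assumptions, not defaults shipped with beta-rec:

# Hypothetical minimal config for EvalEngine; values are placeholders.
config = {
    "system": {
        "metrics": ["ndcg", "precision", "recall", "map"],
        "k": [5, 10, 20],          # top-k list used at test time
        "valid_metric": "ndcg",    # metric used for model selection
        "valid_k": 10,             # k used for model selection
    },
    "model": {
        "batch_eval": True,        # optional; defaults to False when absent
        "batch_size": 400,
        "run_dir": "./runs/demo",  # tensorboard log directory
    },
}
eval_engine = EvalEngine(config)
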
    def flush(self):
        """Flush eval_engine."""
        self.n_no_update = 0
        self.best_valid_performance = 0

    def predict(self, data_df, model, batch_eval=False):
        """Make prediction for a trained model.

        Args:
            data_df (DataFrame): A dataset to be evaluated.
            model: A trained model.
            batch_eval (Boolean): A signal to indicate if the model is evaluated in batches.

        Returns:
            array: predicted scores.
        """
        user_ids = data_df[DEFAULT_USER_COL].to_numpy()
        item_ids = data_df[DEFAULT_ITEM_COL].to_numpy()
        if batch_eval:
            # Ceil division so that the final (possibly partial) batch is included
            # and no empty batch is ever passed to the model.
            n_batch = (len(data_df) + self.batch_size - 1) // self.batch_size
            predictions = np.array([])
            for idx in range(n_batch):
                start_idx = idx * self.batch_size
                end_idx = min((idx + 1) * self.batch_size, len(data_df))
                sub_user_ids = user_ids[start_idx:end_idx]
                sub_item_ids = item_ids[start_idx:end_idx]
                sub_predictions = np.array(
                    model.predict(sub_user_ids, sub_item_ids)
                    .flatten()
                    .to(torch.device("cpu"))
                    .detach()
                    .numpy()
                )
                predictions = np.append(predictions, sub_predictions)
        else:
            predictions = np.array(
                model.predict(user_ids, item_ids)
                .flatten()
                .to(torch.device("cpu"))
                .detach()
                .numpy()
            )
        return predictions

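The engine only assumes that model.predict(user_ids, item_ids) returns a torch tensor with one score per pair; a hypothetical stub illustrating that contract (not a model from the library):

import torch

class DummyModel:
    """Hypothetical model stub: random scores, one per (user, item) pair."""

    def predict(self, user_ids, item_ids):
        # Must return a torch tensor so .flatten().to(...).detach().numpy() works.
        return torch.rand(len(user_ids))

# eval_engine.predict(test_df, DummyModel(), batch_eval=True) would then
# return a 1-D numpy array with len(test_df) scores.
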
    def seq_predict(self, train_seq, data_df, model, maxlen):
        """Make prediction for a trained sequential model.

        Args:
            train_seq (dict): training sequence (list of interacted items) of each user, indexed by user id.
            data_df (DataFrame): A dataset to be evaluated.
            model: A trained model.
            maxlen (int): maximum length of the input sequence.

        Returns:
            array: predicted scores.
        """
        user_ids = data_df[DEFAULT_USER_COL].to_numpy()
        item_ids = data_df[DEFAULT_ITEM_COL].to_numpy()
        user_item_list = data_df.groupby([DEFAULT_USER_COL])[DEFAULT_ITEM_COL].apply(
            list
        )
        result_dic = {}
        with torch.no_grad():
            for u, items in user_item_list.items():
                if len(train_seq[u]) < 1:
                    continue
                seq = np.zeros([maxlen], dtype=np.int32)
                idx = maxlen - 1
                for i in reversed(train_seq[u]):
                    seq[idx] = i
                    idx -= 1
                    if idx == -1:
                        break
                score = model.predict(
                    np.array([u]),
                    np.array([seq]),
                    np.array(items),
                )
                score = np.array(
                    score.flatten().to(torch.device("cpu")).detach().numpy() * -1
                )
                for i, item in enumerate(items):
                    result_dic[(u, item)] = score[i]
        predictions = []
        for u, i in zip(user_ids, item_ids):
            predictions.append(result_dic[(u, i)])
        return np.array(predictions)

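A hypothetical shape of the inputs this method expects, assuming train_seq maps each user id to that user's chronologically ordered training items (ids invented for the example):

# Hypothetical inputs for seq_predict.
train_seq = {
    1: [12, 7, 33, 5],
    2: [4, 9],
}
# data_df holds the (user, item) pairs to score, using the default column names.
# predictions = eval_engine.seq_predict(train_seq, valid_df, model, maxlen=50)
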
    def test_seq_predict(self, train_seq, valid_data_df, test_data_df, model, maxlen):
        """Make prediction on the testing set for a trained sequential model.

        Args:
            train_seq (dict): training sequence (list of interacted items) of each user, indexed by user id.
            valid_data_df (DataFrame): The validation dataset, whose positive items are appended to the user profile.
            test_data_df (DataFrame): The testing dataset to be evaluated.
            model: A trained model.
            maxlen (int): maximum length of the input sequence.

        Returns:
            array: predicted scores.
        """
        valid_user_item_list = (
            valid_data_df[valid_data_df[DEFAULT_RATING_COL] > 0]
            .groupby([DEFAULT_USER_COL])[DEFAULT_ITEM_COL]
            .apply(list)
        )
        user_item_list = test_data_df.groupby([DEFAULT_USER_COL])[
            DEFAULT_ITEM_COL
        ].apply(list)
        user_ids = test_data_df[DEFAULT_USER_COL].to_numpy()
        item_ids = test_data_df[DEFAULT_ITEM_COL].to_numpy()
        predictions = []
        result_dic = {}
        with torch.no_grad():
            for u, items in user_item_list.items():
                if len(train_seq[u]) < 1:
                    continue
                seq = np.zeros([maxlen], dtype=np.int32)
                idx = maxlen - 1
                if u in valid_user_item_list:
                    for i in reversed(
                        valid_user_item_list[u]
                    ):  # add validation item list
                        seq[idx] = i
                        idx -= 1
                        if idx == -1:
                            break
                for i in reversed(train_seq[u]):
                    if idx <= -1:
                        break
                    seq[idx] = i
                    idx -= 1
                    if idx == -1:
                        break
                score = model.predict(
                    np.array([u]),
                    np.array([seq]),
                    np.array(items),
                )
                score = np.array(
                    score.flatten().to(torch.device("cpu")).detach().numpy() * -1
                )
                for i, item in enumerate(items):
                    result_dic[(u, item)] = score[i]
        for u, i in zip(user_ids, item_ids):
            predictions.append(result_dic[(u, i)])
        return np.array(predictions)

# implemented this due to differences in indexing for TiSASRec, not sure if needed at all
    def seq_predict_time(self, train_seq, data_df, model, maxlen, time_span):
        """Make prediction for a trained time-aware sequential model (e.g. TiSASRec).

        Args:
            train_seq (dict): training sequence of (item, timestamp) pairs of each user, indexed by user id.
            data_df (DataFrame): A dataset to be evaluated.
            model: A trained model.
            maxlen (int): maximum length of the input sequence.
            time_span (int): maximum time interval used when building the relative position matrix.

        Returns:
            array: predicted scores.
        """
        user_ids = data_df[DEFAULT_USER_COL].to_numpy()
        item_ids = data_df[DEFAULT_ITEM_COL].to_numpy()
        user_item_list = data_df.groupby([DEFAULT_USER_COL])[DEFAULT_ITEM_COL].apply(
            list
        )
        result_dic = {}
        with torch.no_grad():
            for u, items in user_item_list.items():
                if len(train_seq[u]) < 1:
                    continue
                seq = np.zeros([maxlen], dtype=np.int32)
                time_seq = np.zeros([maxlen], dtype=np.int32)
                idx = maxlen - 1
                for i in reversed(train_seq[u]):
                    seq[idx] = i[0]
                    time_seq[idx] = i[1]
                    idx -= 1
                    if idx == -1:
                        break
                # Build the relative position matrix after time_seq has been filled;
                # computing it earlier would use an all-zero timestamp sequence.
                time_matrix = computeRePos(time_seq, time_span)
                score = model.predict(
                    np.array([u]),
                    np.array([seq]),
                    np.array([time_matrix]),
                    np.array(items),
                )
                score = np.array(
                    score.flatten().to(torch.device("cpu")).detach().numpy() * -1
                )
                for i, item in enumerate(items):
                    result_dic[(u, item)] = score[i]
        predictions = []
        for u, i in zip(user_ids, item_ids):
            predictions.append(result_dic[(u, i)])
        return np.array(predictions)

# implemented this due to differences in indexing, not sure if needed
    def test_seq_predict_time(
        self, train_seq, valid_data_df, test_data_df, model, maxlen, time_span
    ):
        """Make prediction on the testing set for a trained time-aware sequential model.

        Args:
            train_seq (dict): training sequence of (item, timestamp) pairs of each user, indexed by user id.
            valid_data_df (DataFrame): The validation dataset, whose positive items are appended to the user profile.
            test_data_df (DataFrame): The testing dataset to be evaluated.
            model: A trained model.
            maxlen (int): maximum length of the input sequence.
            time_span (int): maximum time interval used when building the relative position matrix.

        Returns:
            array: predicted scores.
        """
        valid_user_item_list = (
            valid_data_df[valid_data_df[DEFAULT_RATING_COL] > 0]
            .groupby([DEFAULT_USER_COL])[DEFAULT_ITEM_COL]
            .apply(list)
        )
        user_item_list = test_data_df.groupby([DEFAULT_USER_COL])[
            DEFAULT_ITEM_COL
        ].apply(list)
        user_ids = test_data_df[DEFAULT_USER_COL].to_numpy()
        item_ids = test_data_df[DEFAULT_ITEM_COL].to_numpy()
        predictions = []
        result_dic = {}
        with torch.no_grad():
            for u, items in user_item_list.items():
                if len(train_seq[u]) < 1:
                    continue
                seq = np.zeros([maxlen], dtype=np.int32)
                time_seq = np.zeros([maxlen], dtype=np.int32)
                idx = maxlen - 1
                if u in valid_user_item_list:
                    for i in reversed(
                        valid_user_item_list[u]
                    ):  # add validation item list
                        seq[idx] = i[0]
                        time_seq[idx] = i[1]
                        idx -= 1
                        if idx == -1:
                            break
                for i in reversed(train_seq[u]):
                    if idx <= -1:
                        break
                    seq[idx] = i[0]
                    time_seq[idx] = i[1]
                    idx -= 1
                    if idx == -1:
                        break
                # Build the relative position matrix after time_seq has been filled;
                # computing it earlier would use an all-zero timestamp sequence.
                time_matrix = computeRePos(time_seq, time_span)
                score = model.predict(
                    np.array([u]),
                    np.array([seq]),
                    np.array([time_matrix]),
                    np.array(items),
                )
                score = np.array(
                    score.flatten().to(torch.device("cpu")).detach().numpy() * -1
                )
                for i, item in enumerate(items):
                    result_dic[(u, item)] = score[i]
        for u, i in zip(user_ids, item_ids):
            predictions.append(result_dic[(u, i)])
        return np.array(predictions)

    def train_eval(self, valid_data_df, test_data_df, model, epoch_id=0):
        """Evaluate the performance for a (validation) dataset with multi-threading.

        Args:
            valid_data_df (DataFrame): A validation dataset.
            test_data_df (DataFrame): A testing dataset.
            model: trained model.
            epoch_id: epoch_id.
        """
        valid_pred = self.predict(valid_data_df, model, self.batch_eval)
        test_pred = self.predict(test_data_df, model, self.batch_eval)
        worker = Thread(
            target=train_eval_worker,
            args=(
                self,
                valid_data_df,
                test_data_df,
                valid_pred,
                test_pred,
                epoch_id,
            ),
        )
        worker.start()

    def seq_train_eval(
        self, train_seq, valid_data_df, test_data_df, model, maxlen, epoch_id=0
    ):
        """Evaluate a sequential model on a (validation) dataset with multi-threading.

        Args:
            train_seq (dict): training sequence of each user, indexed by user id.
            valid_data_df (DataFrame): A validation dataset.
            test_data_df (DataFrame): A testing dataset.
            model: trained model.
            maxlen (int): maximum length of the input sequence.
            epoch_id: epoch_id.
        """
        valid_pred = self.seq_predict(train_seq, valid_data_df, model, maxlen)
        test_pred = self.test_seq_predict(
            train_seq, valid_data_df, test_data_df, model, maxlen
        )
        worker = Thread(
            target=train_eval_worker,
            args=(
                self,
                valid_data_df,
                test_data_df,
                valid_pred,
                test_pred,
                epoch_id,
            ),
        )
        worker.start()

# implemented this due to differences in indexing, not sure if needed
    def seq_train_eval_time(
        self,
        train_seq,
        valid_data_df,
        test_data_df,
        model,
        maxlen,
        time_span,
        epoch_id=0,
    ):
        """Evaluate a time-aware sequential model on a (validation) dataset with multi-threading.

        Args:
            train_seq (dict): training sequence of (item, timestamp) pairs of each user, indexed by user id.
            valid_data_df (DataFrame): A validation dataset.
            test_data_df (DataFrame): A testing dataset.
            model: trained model.
            maxlen (int): maximum length of the input sequence.
            time_span (int): maximum time interval used when building the relative position matrix.
            epoch_id: epoch_id.
        """
        valid_pred = self.seq_predict_time(
            train_seq, valid_data_df, model, maxlen, time_span
        )
        test_pred = self.test_seq_predict_time(
            train_seq, valid_data_df, test_data_df, model, maxlen, time_span
        )
        worker = Thread(
            target=train_eval_worker,
            args=(
                self,
                valid_data_df,
                test_data_df,
                valid_pred,
                test_pred,
                epoch_id,
            ),
        )
        worker.start()

    @timeit
    def test_eval(self, test_df_list, model):
        """Evaluate the performance for a (testing) dataset list with multi-threading.

        Args:
            test_df_list (list): (testing) dataset list.
            model: trained model.
        """
        if type(test_df_list) is not list:  # compatible with testing a single test set
            test_df_list = [test_df_list]
        return_value_list = []
        for test_data_df in test_df_list:
            test_pred = self.predict(test_data_df, model, self.batch_eval)
            with concurrent.futures.ThreadPoolExecutor() as executor:
                future = executor.submit(
                    test_eval_worker, self, test_data_df, test_pred
                )
                return_value = future.result()
            return_value_list.append(return_value)
        return return_value_list

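Putting the pieces together, a hedged end-to-end sketch; the model, DataFrames, and epoch count are placeholders, and the config must also contain the result-file keys read by test_eval_worker:

# Hypothetical end-to-end usage; `config`, `model`, `valid_df`, `test_df`,
# and `n_epochs` are placeholders and not part of this module.
eval_engine = EvalEngine(config)
for epoch in range(n_epochs):
    # ... train the model for one epoch ...
    eval_engine.train_eval(valid_df, test_df, model, epoch)  # runs asynchronously in a Thread
results = eval_engine.test_eval([test_df], model)  # one result dict per test set
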
    def record_performance(self, valid_result, test_result, epoch_id):
        """Record performance results on tensorboard.

        Args:
            valid_result (dict): Performance result of validation set.
            test_result (dict): Performance result of testing set.
            epoch_id (int): epoch_id.
        """
        for metric in self.metrics:
            self.writer.add_scalars(
                "performance/" + metric,
                {
                    "valid": valid_result[f"{metric}@{self.valid_k}"],
                    "test": test_result[f"{metric}@{self.valid_k}"],
                },
                epoch_id,
            )

class SeqEvalEngine(object):
    """The base evaluation engine for sequential recommendation."""

    def __init__(self, config):
        """Init SeqEvalEngine Class.

        Args:
            config (dict): parameters for the model.
        """
        self.config = config  # model configuration, should be a dict
        self.metrics = config["system"]["metrics"]
        self.valid_metric = config["system"]["valid_metric"]

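A hypothetical configuration sketch for SeqEvalEngine; the "model" keys below are the ones read by train_eval_seq and test_eval_seq further down, and all values are placeholders:

# Hypothetical config for SeqEvalEngine; values are placeholders.
config = {
    "system": {
        "metrics": ["ndcg", "precision", "recall", "mrr"],
        "valid_metric": "ndcg",
        "valid_k": 10,
        "k": [5, 10, 20],
    },
    "model": {
        "GIVEN_K": -1,      # next-item evaluation: profile = all but the last item
        "LOOK_AHEAD": 1,    # ground truth = the single next interaction
        "STEP": 1,
        "scroll": False,
    },
}
seq_eval_engine = SeqEvalEngine(config)
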
    def sequential_evaluation(
        self,
        recommender,
        test_sequences,
        evaluation_functions,
        users=None,
        given_k=1,
        look_ahead=1,
        top_n=10,
        scroll=True,
        step=1,
    ):
        """Run sequential evaluation of a recommender over a set of test sequences.

        Args:
            recommender (object): the instance of the recommender to test.
            test_sequences (List): the set of test sequences.
            evaluation_functions (dict): list of evaluation metric functions.
            users (List): (optional) the list of user ids associated with each test sequence.
            given_k (int): (optional) the initial size of each user profile, starting from
                the first interaction in the sequence. If <0, start counting from the end
                of the sequence. It must be != 0.
            look_ahead (int): (optional) number of subsequent interactions in the sequence
                to be considered as ground truth. It can be any positive number or 'all'
                to extend the ground truth until the end of the sequence.
            top_n (int): (optional) size of the recommendation list.
            scroll (boolean): (optional) whether to scroll the ground truth until the end
                of the sequence. If True, expand the user profile and move the ground
                truth forward by `step` interactions, recomputing and evaluating
                recommendations every time. If False, evaluate recommendations once per
                sequence without expanding the user profile.
            step (int): (optional) number of interactions added to the user profile at
                each step of the sequential evaluation.

        Returns:
            metrics/len(test_sequences) (1d array): the average value of each evaluation metric.
        """
        if given_k == 0:
            raise ValueError("given_k must be != 0")

        metrics = np.zeros(len(evaluation_functions))
        with tqdm(total=len(test_sequences)) as pbar:
            for i, test_seq in enumerate(test_sequences):
                if users is not None:
                    user = users[i]
                else:
                    user = None
                if scroll:
                    metrics += self.sequence_sequential_evaluation(
                        recommender,
                        test_seq,
                        evaluation_functions,
                        user,
                        given_k,
                        look_ahead,
                        top_n,
                        step,
                    )
                else:
                    metrics += self.evaluate_sequence(
                        recommender,
                        test_seq,
                        evaluation_functions,
                        user,
                        given_k,
                        look_ahead,
                        top_n,
                    )
                pbar.update(1)
        return metrics / len(test_sequences)

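To make the protocol concrete, a hypothetical walk-through for a single sequence (item ids invented for the example):

# Hypothetical sequence of 5 items.
seq = ["a", "b", "c", "d", "e"]

# given_k=-1, look_ahead=1, scroll=False (classical next-item evaluation):
#   profile = ["a", "b", "c", "d"], ground truth = [["e"]]  -> one evaluation.
#
# given_k=2, look_ahead=1, scroll=True, step=1:
#   profile ["a", "b"]            -> ground truth [["c"]]
#   profile ["a", "b", "c"]       -> ground truth [["d"]]
#   profile ["a", "b", "c", "d"]  -> ground truth [["e"]]
#   The three metric vectors are averaged for this sequence.
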
    def evaluate_sequence(
        self,
        recommender,
        seq,
        evaluation_functions,
        user,
        given_k,
        look_ahead,
        top_n,
    ):
        """Compute metrics for a single sequence.

        Args:
            recommender (object): which recommender to use.
            seq (List): the user profile / context.
            evaluation_functions (dict): which functions to use to evaluate the recommendation performance.
            user: the user id associated with the sequence, or None.
            given_k (int): last element used as ground truth. NB: if <0 it is interpreted
                as the number of first elements to keep.
            look_ahead (int): number of elements in the ground truth to consider.
                If look_ahead == 'all', the whole remaining sequence is considered.
            top_n (int): size of the recommendation list.

        Returns:
            np.array(tmp_results) (1d array): performance of the recommender.
        """
        # safety checks
        if given_k < 0:
            given_k = len(seq) + given_k

        user_profile = seq[:given_k]
        ground_truth = seq[given_k:]
        # restrict ground truth to look_ahead
        ground_truth = (
            ground_truth[:look_ahead] if look_ahead != "all" else ground_truth
        )
        ground_truth = list(map(lambda x: [x], ground_truth))  # list of list format

        if len(user_profile) == 0 or len(ground_truth) == 0:
            # if either is missing, all evaluation functions are 0
            return np.zeros(len(evaluation_functions))

        r = recommender.recommend(user_profile, user)[:top_n]

        if not r:
            # no recommendation found
            return np.zeros(len(evaluation_functions))
        reco_list = recommender.get_recommendation_list(r)
        tmp_results = []
        for f in evaluation_functions:
            tmp_results.append(f(ground_truth, reco_list))
        return np.array(tmp_results)

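The only interface evaluate_sequence assumes from the recommender is recommend(user_profile, user) plus get_recommendation_list(r); a hypothetical stub showing that contract (not a recommender shipped with the library):

class MostRecentRepeatRecommender:
    """Hypothetical stub: recommends the items the user interacted with most recently."""

    def recommend(self, user_profile, user=None):
        # Return an internal recommendation object; here simply (item, score) pairs.
        return [(item, 1.0) for item in reversed(user_profile)]

    def get_recommendation_list(self, recommendation):
        # Convert the internal representation into a plain ranked item list.
        return [item for item, _ in recommendation]
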
    def sequence_sequential_evaluation(
        self,
        recommender,
        seq,
        evaluation_functions,
        user,
        given_k,
        look_ahead,
        top_n,
        step,
    ):
        """Compute metrics for a single sequence incrementally.

        Args:
            recommender (object): which recommender to use.
            seq (List): the user profile / context.
            evaluation_functions (dict): which functions to use to evaluate the recommendation performance.
            user: the user id associated with the sequence, or None.
            given_k (int): last element used as ground truth. NB: if <0 it is interpreted
                as the number of first elements to keep.
            look_ahead (int): number of elements in the ground truth to consider.
                If look_ahead == 'all', the whole remaining sequence is considered.
            top_n (int): size of the recommendation list.
            step (int): number of interactions added to the user profile at each step.

        Returns:
            eval_res/eval_cnt (1d array): performance of the recommender.
        """
        if given_k < 0:
            given_k = len(seq) + given_k

        eval_res = 0.0
        eval_cnt = 0
        for gk in range(given_k, len(seq), step):
            eval_res += self.evaluate_sequence(
                recommender,
                seq,
                evaluation_functions,
                user,
                gk,
                look_ahead,
                top_n,
            )
            eval_cnt += 1
        return eval_res / eval_cnt

    def get_test_sequences(self, test_data, given_k):
        """Run evaluation only over sequences longer than abs(LAST_K).

        Args:
            test_data (pandas.DataFrame): Test set.
            given_k (int): last element used as ground truth.

        Returns:
            test_sequences (List): list of sequences for testing.
        """
        # we can run evaluation only over sequences longer than abs(LAST_K)
        test_sequences = test_data.loc[
            test_data["col_sequence"].map(len) > abs(given_k), "col_sequence"
        ].values
        return test_sequences

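A hypothetical test_data frame in the format this helper expects, i.e. one pre-built item sequence per row in a col_sequence column (the item ids are invented):

import pandas as pd

# Hypothetical test frame: one interaction sequence per user.
test_data = pd.DataFrame({"col_sequence": [[1, 4, 2, 9], [3, 7], [5]]})
# With given_k=-1 only sequences longer than 1 are kept, i.e. the first two rows.
# sequences = seq_eval_engine.get_test_sequences(test_data, given_k=-1)
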
    def train_eval_seq(self, valid_data, test_data, recommender, epoch_id=0):
        """Compute performance of a sequential model on the validation and test datasets for each epoch during training.

        Args:
            valid_data (pandas.DataFrame): validation dataset.
            test_data (pandas.DataFrame): test dataset.
            recommender (Object): Sequential recommender.
            epoch_id (int): id of the epoch.

        Returns:
            None
        """
        METRICS = {
            "ndcg": ndcg,
            "precision": precision,
            "recall": recall,
            "mrr": mrr,
        }
        TOPN = self.config["system"]["valid_k"]  # length of the recommendation list

        # GIVEN_K=-1, LOOK_AHEAD=1, STEP=1 corresponds to the classical next-item evaluation
        GIVEN_K = self.config["model"]["GIVEN_K"]
        LOOK_AHEAD = self.config["model"]["LOOK_AHEAD"]
        STEP = self.config["model"]["STEP"]
        scroll = self.config["model"]["scroll"]

        # valid data
        valid_sequences = self.get_test_sequences(valid_data, GIVEN_K)
        print("{} sequences available for evaluation".format(len(valid_sequences)))

        valid_results = self.sequential_evaluation(
            recommender,
            test_sequences=valid_sequences,
            given_k=GIVEN_K,
            look_ahead=LOOK_AHEAD,
            evaluation_functions=METRICS.values(),
            top_n=TOPN,
            scroll=scroll,  # scrolling averages metrics over all profile lengths
            step=STEP,
        )

        print(
            "Sequential evaluation (GIVEN_K={}, LOOK_AHEAD={}, STEP={})".format(
                GIVEN_K, LOOK_AHEAD, STEP
            )
        )
        for mname, mvalue in zip(METRICS.keys(), valid_results):
            print("\t{}@{}: {:.4f}".format(mname, TOPN, mvalue))

        # test data
        test_sequences = self.get_test_sequences(test_data, GIVEN_K)
        print("{} sequences available for evaluation".format(len(test_sequences)))

        test_results = self.sequential_evaluation(
            recommender,
            test_sequences=test_sequences,
            given_k=GIVEN_K,
            look_ahead=LOOK_AHEAD,
            evaluation_functions=METRICS.values(),
            top_n=TOPN,
            scroll=scroll,  # scrolling averages metrics over all profile lengths
            step=STEP,
        )

        print(
            "Sequential evaluation (GIVEN_K={}, LOOK_AHEAD={}, STEP={})".format(
                GIVEN_K, LOOK_AHEAD, STEP
            )
        )
        for mname, mvalue in zip(METRICS.keys(), test_results):
            print("\t{}@{}: {:.4f}".format(mname, TOPN, mvalue))

    def test_eval_seq(self, test_data, recommender):
        """Compute performance of a sequential model on the test dataset.

        Args:
            test_data (pandas.DataFrame): test dataset.
            recommender (Object): Sequential recommender.

        Returns:
            None
        """
        METRICS = {
            "ndcg": ndcg,
            "precision": precision,
            "recall": recall,
            "mrr": mrr,
        }
        TOPNs = self.config["system"]["k"]  # lengths of the recommendation lists

        # GIVEN_K=-1, LOOK_AHEAD=1, STEP=1 corresponds to the classical next-item evaluation
        GIVEN_K = self.config["model"]["GIVEN_K"]
        LOOK_AHEAD = self.config["model"]["LOOK_AHEAD"]
        STEP = self.config["model"]["STEP"]
        scroll = self.config["model"]["scroll"]

        # test data
        test_sequences = self.get_test_sequences(test_data, GIVEN_K)
        print("{} sequences available for evaluation".format(len(test_sequences)))

        for TOPN in TOPNs:
            test_results = self.sequential_evaluation(
                recommender,
                test_sequences=test_sequences,
                given_k=GIVEN_K,
                look_ahead=LOOK_AHEAD,
                evaluation_functions=METRICS.values(),
                top_n=TOPN,
                scroll=scroll,  # scrolling averages metrics over all profile lengths
                step=STEP,
            )
            print(
                "Sequential evaluation (GIVEN_K={}, LOOK_AHEAD={}, STEP={})".format(
                    GIVEN_K, LOOK_AHEAD, STEP
                )
            )
            for mname, mvalue in zip(METRICS.keys(), test_results):
                print("\t{}@{}: {:.4f}".format(mname, TOPN, mvalue))