Dear author:
I ran into an issue with how to run this baseline. NeuMF replaces the inner product with an MLP, which means it needs far more resources during full-ranking evaluation. When I try to run it on the Yelp dataset with the same data-processing setup as NCL, it requires a very large amount of memory (over 100 GB), yet NCL itself was run on a 1080 Ti with about 11 GB. So I would like to know how to run NeuMF with the all-ranking (full-sort) evaluation strategy.
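For context, here is my own rough arithmetic (just my reading of the traceback, not anything from the RecBole docs): since NeuMF does not implement full_sort_predict, the trainer falls back to calling predict on every user-item pair in the eval batch, so the concatenated MLP input alone for one full-sort batch is roughly:

# my own estimate of the allocation that fails below (sketch, not RecBole code)
pairs = 4_096_000        # eval_batch_size = user-item pairs per full-sort batch
mlp_in = 128             # user_mlp_e (64) + item_mlp_e (64) concatenated
bytes_fp32 = 4
print(pairs * mlp_in * bytes_fp32 / 2**30)   # ~1.95 GiB, matching the failed allocation

and that is before the MF branch, the embeddings, and the hidden layers are counted.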
Error log:
command line args [--dataset yelp --model NeuMF --dropout_prob =0.0] will not be used in RecBole
19 Dec 23:29 INFO
General Hyper Parameters:
gpu_id = 1
use_gpu = True
seed = 2023
state = INFO
reproducibility = True
data_path = dataset/yelp
show_progress = False
save_dataset = False
save_dataloaders = False
benchmark_filename = None
Training Hyper Parameters:
checkpoint_dir = saved
epochs = 300
train_batch_size = 2048
learner = adam
learning_rate = 0.001
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4
Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [0.8, 0.1, 0.1]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}
metrics = ['Recall', 'NDCG', 'Precision', 'Hit']
topk = [5, 10, 15, 20, 25, 40, 50, 60, 100, 150, 200]
valid_metric = Recall@20
valid_metric_bigger = True
eval_batch_size = 4096000
metric_decimal_place = 4
Dataset Hyper Parameters:
field_separator =
seq_separator =
USER_ID_FIELD = user_id
ITEM_ID_FIELD = item_id
RATING_FIELD = rating
TIME_FIELD = timestamp
seq_len = None
LABEL_FIELD = label
threshold = None
NEG_PREFIX = neg_
load_col = {'inter': ['user_id', 'item_id', 'rating']}
unload_col = None
unused_col = None
additional_feat_suffix = None
rm_dup_inter = None
val_interval = {'rating': '[3,inf)'}
filter_inter_by_user_or_item = True
user_inter_num_interval = [15,inf)
item_inter_num_interval = [15,inf)
alias_of_user_id = None
alias_of_item_id = None
alias_of_entity_id = None
alias_of_relation_id = None
preload_weight = None
normalize_field = None
normalize_all = None
ITEM_LIST_LENGTH_FIELD = item_length
LIST_SUFFIX = _list
MAX_ITEM_LIST_LENGTH = 50
POSITION_FIELD = position_id
HEAD_ENTITY_ID_FIELD = head_id
TAIL_ENTITY_ID_FIELD = tail_id
RELATION_ID_FIELD = relation_id
ENTITY_ID_FIELD = entity_id
Other Hyper Parameters:
neg_sampling = {'uniform': 1}
repeatable = False
mf_embedding_size = 64
mlp_embedding_size = 64
mlp_hidden_size = [32, 16, 8]
dropout_prob = 0.1
mf_train = True
mlp_train = True
use_pretrain = False
mf_pretrain_path = None
mlp_pretrain_path = None
MODEL_TYPE = ModelType.GENERAL
eval_setting = {'split': {'RS': [0.8, 0.1, 0.1]}, 'order': 'RO', 'group_by': 'user', 'mode': 'full'}
embedding_size = 64
reg_weight = 0.0001
warm_up_step = -1
MODEL_INPUT_TYPE = InputType.POINTWISE
eval_type = EvaluatorType.RANKING
device = cuda
train_neg_sample_args = {'strategy': 'by', 'by': 1, 'distribution': 'uniform'}
eval_neg_sample_args = {'strategy': 'full', 'distribution': 'uniform'}
19 Dec 23:29 INFO yelp
The number of users: 45478
Average actions of users: 39.09151878971788
The number of items: 30709
Average actions of items: 57.89256871173635
The number of inters: 1777765
The sparsity of the dataset: 99.87270617988263%
Remain Fields: ['user_id', 'item_id', 'rating']
19 Dec 23:30 INFO [Training]: train_batch_size = [2048] negative sampling: [{'uniform': 1}]
19 Dec 23:30 INFO [Evaluation]: eval_batch_size = [4096000] eval_args: [{'split': {'RS': [0.8, 0.1, 0.1]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}]
19 Dec 23:30 INFO NeuMF(
(user_mf_embedding): Embedding(45478, 64)
(item_mf_embedding): Embedding(30709, 64)
(user_mlp_embedding): Embedding(45478, 64)
(item_mlp_embedding): Embedding(30709, 64)
(mlp_layers): MLPLayers(
(mlp_layers): Sequential(
(0): Dropout(p=0.1, inplace=False)
(1): Linear(in_features=128, out_features=32, bias=True)
(2): ReLU()
(3): Dropout(p=0.1, inplace=False)
(4): Linear(in_features=32, out_features=16, bias=True)
(5): ReLU()
(6): Dropout(p=0.1, inplace=False)
(7): Linear(in_features=16, out_features=8, bias=True)
(8): ReLU()
)
)
(predict_layer): Linear(in_features=72, out_features=1, bias=True)
(sigmoid): Sigmoid()
(loss): BCELoss()
)
Trainable parameters: 9756801
19 Dec 23:31 INFO epoch 0 training [time: 60.39s, train loss: 783.6968]
Traceback (most recent call last):
File "/home/xxx/anaconda3/envs/recbole/lib/python3.7/site-packages/recbole/trainer/trainer.py", line 376, in _full_sort_batch_eval
scores = self.model.full_sort_predict(interaction.to(self.device))
File "/home/xxx/anaconda3/envs/recbole/lib/python3.7/site-packages/recbole/model/abstract_recommender.py", line 66, in full_sort_predict
raise NotImplementedError
NotImplementedError
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "run.py", line 35, in
run_recbole(model=args.model, dataset=args.dataset, config_file_list=args.config_file_list)
File "/home/xxx/anaconda3/envs/recbole/lib/python3.7/site-packages/recbole/quick_start/quick_start.py", line 60, in run_recbole
train_data, valid_data, saved=saved, show_progress=config['show_progress']
File "/home/xxx/anaconda3/envs/recbole/lib/python3.7/site-packages/recbole/trainer/trainer.py", line 334, in fit
valid_score, valid_result = self._valid_epoch(valid_data, show_progress=show_progress)
File "/home/xxx/anaconda3/envs/recbole/lib/python3.7/site-packages/recbole/trainer/trainer.py", line 196, in _valid_epoch
valid_result = self.evaluate(valid_data, load_best_model=False, show_progress=show_progress)
File "/home/xxx/anaconda3/envs/recbole/lib/python3.7/site-packages/torch/autograd/grad_mode.py", line 26, in decorate_context
return func(*args, **kwargs)
File "/home/xxx/anaconda3/envs/recbole/lib/python3.7/site-packages/recbole/trainer/trainer.py", line 459, in evaluate
interaction, scores, positive_u, positive_i = eval_func(batched_data)
File "/home/xxx/anaconda3/envs/recbole/lib/python3.7/site-packages/recbole/trainer/trainer.py", line 383, in _full_sort_batch_eval
scores = self.model.predict(new_inter)
File "/home/xxx/anaconda3/envs/recbole/lib/python3.7/site-packages/recbole/model/general_recommender/neumf.py", line 133, in predict
return self.forward(user, item)
File "/home/xxx/anaconda3/envs/recbole/lib/python3.7/site-packages/recbole/model/general_recommender/neumf.py", line 111, in forward
mlp_output = self.mlp_layers(torch.cat((user_mlp_e, item_mlp_e), -1)) # [batch_size, layers[-1]]
RuntimeError: CUDA out of memory. Tried to allocate 1.95 GiB (GPU 0; 10.92 GiB total capacity; 5.09 GiB already allocated; 885.00 MiB free; 5.11 GiB reserved in total by PyTorch)
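One workaround I am considering (just a sketch on my side, not your code, and I have not verified it end to end) is to either shrink eval_batch_size in the yaml config, or give NeuMF a chunked full_sort_predict so full ranking never materializes all user-item pairs at once:

# Sketch: subclass RecBole's NeuMF and score items in chunks during full ranking.
# ITEM_CHUNK is my own knob; you would also have to plug this class into the run
# script yourself (e.g. construct it directly with the lower-level RecBole API),
# since run_recbole only accepts a model name string.
import torch
from recbole.model.general_recommender import NeuMF

class ChunkedNeuMF(NeuMF):
    ITEM_CHUNK = 4096  # items scored per step; tune to fit GPU memory

    def full_sort_predict(self, interaction):
        user = interaction[self.USER_ID]              # [n_users_in_batch]
        scores = []
        for start in range(0, self.n_items, self.ITEM_CHUNK):
            items = torch.arange(start, min(start + self.ITEM_CHUNK, self.n_items),
                                 device=user.device)
            # repeat each user over the current chunk of items
            u = user.unsqueeze(1).expand(-1, items.numel()).reshape(-1)
            i = items.unsqueeze(0).expand(user.numel(), -1).reshape(-1)
            scores.append(self.forward(u, i).view(user.numel(), -1))
        # the trainer reshapes this back to [n_users, n_items]
        return torch.cat(scores, dim=1).view(-1)

Is something like this the intended way to evaluate NeuMF under the all-ranking protocol, or did you use a different setting (e.g. a smaller eval_batch_size) in your experiments?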