update some datasets links, tag feature process

datawhalechina · Dec 2, 2024 · 5e2354a · 5e2354a
1 parent f24c446
commit 5e2354a
Show file tree

Hide file tree

Showing 4 changed files with 16 additions and 5 deletions.
diff --git a/examples/matching/README.md b/examples/matching/README.md
@@ -77,7 +77,11 @@
 
 [数据集说明](http://millionsongdataset.com/tasteprofile/)
 
+## Book-Crossing
 
+针对美国 Book-Crossing 网站关于用户对书籍的评级行为进行分析，数据包含 27.8 万个匿名用户，提供 115 万个评级（显式/隐式），涉及约 27.1 万本书。增删改查合并后的数据包含约 71.9 万个评级。
+
+[数据集下载和说明](https://www.kaggle.com/datasets/ruchi798/bookcrossing-dataset)
 
 
 ## Session based recommendation datasets

diff --git a/examples/ranking/README.md b/examples/ranking/README.md
@@ -78,8 +78,8 @@ TBD
 - 注意事项
   - 原始数据已划分训练集和测试集，预处理过程将原始数据的测试集随机划分一半作为验证集，预处理后的训练集、验证集、测试集比例为2:1:1。
   - 预处理完的数据集的sparse特征已经Label Encode，dense特征采用归一化处理。数据预处理方式参考[AITM模型预处理脚本](https://github.com/xidongbo/AITM/blob/main/process_public_dataset.py)，我们也提供了处理原始数据的脚本`preprocess_ali_ccp.py`。
-- 原始数据地址：https://tianchi.aliyun.com/dataset/dataDetail?dataId=408
-- 预处理后的全量数据下载地址：https://cowtransfer.com/s/1903cab699fa49
+- 原始数据地址：https://tianchi.aliyun.com/dataset/408
+- 预处理后的全量数据下载地址：https://aistudio.baidu.com/datasetdetail/281072
 
 
 

diff --git a/torch_rechub/trainers/matching.md b/torch_rechub/trainers/matching.md
@@ -0,0 +1,3 @@
+# Matching
+
+召回使用文档
diff --git a/torch_rechub/utils/match.py b/torch_rechub/utils/match.py
@@ -9,10 +9,10 @@
 from pymilvus import Collection,CollectionSchema,DataType,FieldSchema,connections,utility
 
 def gen_model_input(df, user_profile, user_col, item_profile, item_col, seq_max_len, padding='pre', truncating='pre'):
-    """Merge user_profile and item_profile to df, pad and truncate history seuence feature
+    """Merge user_profile and item_profile to df, pad and truncate history sequence feature
 
     Args:
-        df (pd.DataFrame): data with history seuence feature
+        df (pd.DataFrame): data with history sequence feature
         user_profile (pd.DataFrame): user data
         user_col (str): user column name
         item_profile (pd.DataFrame): item data
@@ -29,6 +29,10 @@ def gen_model_input(df, user_profile, user_col, item_profile, item_col, seq_max_
     for col in df.columns.to_list():
         if col.startswith("hist_"):
             df[col] = pad_sequences(df[col], maxlen=seq_max_len, value=0, padding=padding, truncating=truncating).tolist()
+    for col in df.columns.to_list():
+        if col.startswith("tag_"):
+            df[col] = pad_sequences(df[col], maxlen=seq_max_len, value=0, padding=padding, truncating=truncating).tolist()
+
     input_dict = df_to_dict(df)
     return input_dict
 
@@ -161,7 +165,7 @@ def generate_seq_feature_match(data,
     random.shuffle(test_set)
 
     print("n_train: %d, n_test: %d" % (len(train_set), len(test_set)))
-    print("%d cold start user droped " % (n_cold_user))
+    print("%d cold start user dropped " % n_cold_user)
 
     attr_hist_col = ["hist_" + col for col in item_attribute_cols]
     df_train = pd.DataFrame(train_set,