From d5a48c34e0aca9b5f4c0e473e2e2003d1789d650 Mon Sep 17 00:00:00 2001 From: "shenping.matt" Date: Thu, 29 Feb 2024 18:15:06 +0800 Subject: [PATCH] Elkeid v1.9 huge memory occupation on Rocky 8 (v4.18) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit oncall: https://oncall.bytedance.net/chats/user/userCase?cross_oncall_flow_id=46075810&picked_detail=46075810 stack trace: @[ __kmalloc+376 __kmalloc+376 smith_init_ent+26 tt_rb_insert_key_nolock+30 smith_insert_ent+152 security_inode_create_pre_handler+4174 kprobe_ftrace_handler+144 ftrace_ops_assist_func+110 intel_nhlt_get_dmic_geo+286056 security_inode_create+5 path_openat+3372 do_filp_open+147 do_sys_open+388 do_syscall_64+91 entry_SYSCALL_64_after_hwframe+101 ]: 1440 影响范围:内核早于4.19的火山云环境 (1.9版本的驱动,支持文件落盘扫描功能),CentOS 6/7/8均在此范围之内。 产生机制:针对4.19之前的内核(不支持FMODE_CREATE通知功能),HIDS驱动只能自身缓存及管理新文件的创建信息,针对大量创建新文件并长时间占用的情况会导致内存使用量一直累加,因为没有文件关闭事件故不会触发基于LRU的问题控制机制故此内存不会释放,而正常程序会主动关闭文件句柄所以不会触发此问题。 修复方案:针对大量文件创建并保持占用的情况将强制启用LRU回收,已测试通过,发布前还需要进一步强化测试及验证。临时解决办法就是先下线HIDS驱动,等新版本发布后再次拉起即可,不需要系统重启;另外针对新文件创建量不大的系统,可以卸载并重新加载驱动可做到缓解内存占用的持续增加 问题分析与复盘: https://bytedance.larkoffice.com/docx/ZYsid6QOzo93fDx5evncgXLInHd Signed-off-by: shenping.matt --- driver/LKM/src/smith_hook.c | 65 ++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 37 deletions(-) diff --git a/driver/LKM/src/smith_hook.c b/driver/LKM/src/smith_hook.c index 886abf5ca..078380944 100644 --- a/driver/LKM/src/smith_hook.c +++ b/driver/LKM/src/smith_hook.c @@ -4019,8 +4019,8 @@ void smith_enum_img(void) static struct tt_rb g_rb_ent; /* rbtree of cached ents */ static LIST_HEAD(g_lru_ent); /* lru list of cached ents */ -#define SMITH_ENT_REAPER (600) /* 10 minutes */ -#define SMITH_ENT_MAX (2048) /* max cached imgs */ +#define SMITH_ENT_REAPER (60) /* 60 seconds */ +#define SMITH_ENT_MAX (1UL << 16) /* max paths to be cached */ static int smith_build_ent(struct smith_ent *ent, struct smith_ent *obj) { @@ -4093,7 +4093,7 
@@ static void smith_release_ent(struct tt_rb *rb, struct tt_node *tnod) * support routines for entry cache */ -static int smith_drop_head_ent(void) +static int smith_drop_head_ent(int count) { struct list_head *link; struct smith_ent *ent; @@ -4105,18 +4105,14 @@ static int smith_drop_head_ent(void) if (list_empty(&g_lru_ent)) goto errorout; - if (0 == atomic_read(&ent->se_node.refs)) { - if (smith_get_seconds() > ent->se_age) { - list_del_init(&ent->se_link); - /* this entry hasn't been touched for seconds */ - /* so remove the ent from rbtree and drop it */ - tt_rb_remove_node_nolock(&g_rb_ent, &ent->se_node); - rc++; - } - } else { + if (smith_get_seconds() > ent->se_age || count > SMITH_ENT_MAX) { list_del_init(&ent->se_link); - /* smith_put_ent will put it back to lru list */ + /* this entry hasn't been touched for seconds */ + /* so remove the ent from rbtree and drop it */ + tt_rb_remove_node_nolock(&g_rb_ent, &ent->se_node); + rc++; } + errorout: write_unlock(&g_rb_ent.lock); @@ -4127,10 +4123,10 @@ static void smith_drop_head_ents(struct tt_rb *rb) { int count = atomic_read(&rb->count); - do { - if (!smith_drop_head_ent()) + while (--count > SMITH_ENT_MAX) { + if (!smith_drop_head_ent(count)) break; - } while (--count > SMITH_ENT_MAX); + } } static void smith_prepare_ent(char *path, struct smith_ent *ent) @@ -4143,30 +4139,35 @@ static void smith_prepare_ent(char *path, struct smith_ent *ent) int smith_insert_ent(char *path) { - struct smith_ent obj; + struct smith_ent obj, *ent; struct tt_node *tnod = NULL; + /* init obj */ smith_prepare_ent(path, &obj); /* check whether the entry was already inserted ? 
*/ read_lock(&g_rb_ent.lock); tnod = tt_rb_lookup_nolock(&g_rb_ent, &obj); - if (tnod) { - atomic_inc(&tnod->refs); - read_unlock(&g_rb_ent.lock); + read_unlock(&g_rb_ent.lock); + if (tnod) goto out; - } else { - read_unlock(&g_rb_ent.lock); - } /* insert new node to rbtree */ write_lock(&g_rb_ent.lock); tnod = tt_rb_insert_key_nolock(&g_rb_ent, &obj.se_node); - if (tnod) - atomic_inc(&tnod->refs); + if (tnod) { + ent = container_of(tnod, struct smith_ent, se_node); + /* remove ent from LRU if it's already LRUed */ + list_del_init(&ent->se_link); + ent->se_age = smith_get_seconds() + SMITH_ENT_REAPER; + /* insert ent to the tail of LRU list */ + list_add_tail(&ent->se_link, &g_lru_ent); + } write_unlock(&g_rb_ent.lock); + smith_drop_head_ents(&g_rb_ent); + out: return (!!tnod); } @@ -4182,31 +4183,21 @@ int smith_remove_ent(char *path) /* check whether the entry was already inserted ? */ read_lock(&g_rb_ent.lock); tnod = tt_rb_lookup_nolock(&g_rb_ent, &obj); - if (tnod) { - ent = container_of(tnod, struct smith_ent, se_node); - if (ent->se_tgid != current->tgid) - tnod = NULL; - } read_unlock(&g_rb_ent.lock); if (!tnod) goto out; write_lock(&g_rb_ent.lock); + /* do 2nd search to assure it's in lru list */ tnod = tt_rb_lookup_nolock(&g_rb_ent, &obj); if (tnod) { ent = container_of(tnod, struct smith_ent, se_node); list_del_init(&ent->se_link); - if (0 == atomic_dec_return(&ent->se_node.refs)) { - tt_rb_remove_node_nolock(&g_rb_ent, tnod); - } else { - ent->se_age = smith_get_seconds() + SMITH_ENT_REAPER; - list_add_tail(&ent->se_link, &g_lru_ent); - } + tt_rb_remove_node_nolock(&g_rb_ent, tnod); } write_unlock(&g_rb_ent.lock); out: - smith_drop_head_ents(&g_rb_ent); return (!!tnod); }