From e5995b826942a5cb4b16a6665633beec842bb8d7 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 23 Apr 2021 13:39:37 -0700 Subject: [PATCH 1/7] lmkd: Handle cases when proc_get_name() might return NULL proc_get_name() can return NULL if the corresponding process has died or open fails with ENOMEM due to memory shortages. Ensure such cases are handled without NULL pointer access. Bug: 186157675 Signed-off-by: Suren Baghdasaryan Change-Id: I05b288e3808bec0bdb73db32de02ba3a322ca6e1 --- lmkd.cpp | 6 ++++-- statslog.cpp | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/lmkd.cpp b/lmkd.cpp index c646479..3515fdb 100644 --- a/lmkd.cpp +++ b/lmkd.cpp @@ -1157,9 +1157,10 @@ static void cmd_procprio(LMKD_CTRL_PACKET packet, int field_count, struct ucred } else { if (!claim_record(procp, cred->pid)) { char buf[LINE_MAX]; + char *taskname = proc_get_name(cred->pid, buf, sizeof(buf)); /* Only registrant of the record can remove it */ ALOGE("%s (%d, %d) attempts to modify a process registered by another client", - proc_get_name(cred->pid, buf, sizeof(buf)), cred->uid, cred->pid); + taskname ? taskname : "A process ", cred->uid, cred->pid); return; } proc_unslot(procp); @@ -1194,9 +1195,10 @@ static void cmd_procremove(LMKD_CTRL_PACKET packet, struct ucred *cred) { if (!claim_record(procp, cred->pid)) { char buf[LINE_MAX]; + char *taskname = proc_get_name(cred->pid, buf, sizeof(buf)); /* Only registrant of the record can remove it */ ALOGE("%s (%d, %d) attempts to unregister a process registered by another client", - proc_get_name(cred->pid, buf, sizeof(buf)), cred->uid, cred->pid); + taskname ? taskname : "A process ", cred->uid, cred->pid); return; } diff --git a/statslog.cpp b/statslog.cpp index a9606f9..410a108 100644 --- a/statslog.cpp +++ b/statslog.cpp @@ -260,7 +260,7 @@ void stats_remove_taskname(int pid) { } void stats_store_taskname(int pid, const char* taskname) { - if (!enable_stats_log) { + if (!enable_stats_log || !taskname) { return; } From 6f11aff418eca582916b41f1fd6baa1564c46510 Mon Sep 17 00:00:00 2001 From: Vova Sharaienko Date: Sat, 24 Apr 2021 00:30:06 +0000 Subject: [PATCH 2/7] lmkd: reroute atoms logging to AMS - Added new lmkd message for clients to subscribe LMK_ASYNC_EVENT_STAT - Added support to write kill & mem stats information via data socket to be read & parsed on the AMS Java side for future logging to statsd Bug: 184698933 Test: lmkd_unit_test - test check_for_oom tests lmkd message send to AMS Test: statsd_testdrive 51 54 to inspect statsd logged atoms data Change-Id: Id682a438c87b3e4503261d26461f6cee641d86c4 --- Android.bp | 34 +-------- include/lmkd.h | 21 +++--- lmkd.cpp | 49 ++++++++++++- statslog.cpp | 195 ++++++++++++++++++++++++++++++------------------- statslog.h | 70 ++++++++++++------ 5 files changed, 223 insertions(+), 146 deletions(-) diff --git a/Android.bp b/Android.bp index 267cd48..fcf26ca 100644 --- a/Android.bp +++ b/Android.bp @@ -18,11 +18,9 @@ cc_binary { "liblog", "libprocessgroup", "libpsi", - "libstatssocket", ], static_libs: [ "libstatslogc", - "libstatslog_lmkd", "liblmkd_utils", ], local_include_dirs: ["include"], @@ -40,6 +38,7 @@ cc_binary { cc_library_static { name: "libstatslogc", srcs: ["statslog.cpp"], + local_include_dirs: ["include"], cflags: [ "-Wall", "-Werror", @@ -49,37 +48,6 @@ cc_library_static { shared_libs: [ "liblog", ], - static_libs: [ - "libstatslog_lmkd", - ], -} - -genrule { - name: "statslog_lmkd.h", - tools: ["stats-log-api-gen"], - cmd: "$(location stats-log-api-gen) --header $(genDir)/statslog_lmkd.h --module lmkd --namespace android,lmkd,stats", - out: [ - "statslog_lmkd.h", - ], -} - -genrule { - name: "statslog_lmkd.cpp", - tools: ["stats-log-api-gen"], - cmd: "$(location stats-log-api-gen) --cpp $(genDir)/statslog_lmkd.cpp --module lmkd --namespace android,lmkd,stats --importHeader statslog_lmkd.h", - out: [ - "statslog_lmkd.cpp", - ], -} - -cc_library_static { - name: "libstatslog_lmkd", - generated_sources: ["statslog_lmkd.cpp"], - generated_headers: ["statslog_lmkd.h"], - export_generated_headers: ["statslog_lmkd.h"], - shared_libs: [ - "libstatssocket", - ], } cc_library_static { diff --git a/include/lmkd.h b/include/lmkd.h index ad5dc75..c814d17 100644 --- a/include/lmkd.h +++ b/include/lmkd.h @@ -27,14 +27,16 @@ __BEGIN_DECLS * Supported LMKD commands */ enum lmk_cmd { - LMK_TARGET = 0, /* Associate minfree with oom_adj_score */ - LMK_PROCPRIO, /* Register a process and set its oom_adj_score */ - LMK_PROCREMOVE, /* Unregister a process */ - LMK_PROCPURGE, /* Purge all registered processes */ - LMK_GETKILLCNT, /* Get number of kills */ - LMK_SUBSCRIBE, /* Subscribe for asynchronous events */ - LMK_PROCKILL, /* Unsolicited msg to subscribed clients on proc kills */ - LMK_UPDATE_PROPS, /* Reinit properties */ + LMK_TARGET = 0, /* Associate minfree with oom_adj_score */ + LMK_PROCPRIO, /* Register a process and set its oom_adj_score */ + LMK_PROCREMOVE, /* Unregister a process */ + LMK_PROCPURGE, /* Purge all registered processes */ + LMK_GETKILLCNT, /* Get number of kills */ + LMK_SUBSCRIBE, /* Subscribe for asynchronous events */ + LMK_PROCKILL, /* Unsolicited msg to subscribed clients on proc kills */ + LMK_UPDATE_PROPS, /* Reinit properties */ + LMK_STAT_KILL_OCCURRED, /* Unsolicited msg to subscribed clients on proc kills for statsd log */ + LMK_STAT_STATE_CHANGED, /* Unsolicited msg to subscribed clients on state changed */ }; /* @@ -204,10 +206,11 @@ static inline size_t lmkd_pack_set_getkillcnt_repl(LMKD_CTRL_PACKET packet, int return 2 * sizeof(int); } -/* Types of asyncronous events sent from lmkd to its clients */ +/* Types of asynchronous events sent from lmkd to its clients */ enum async_event_type { LMK_ASYNC_EVENT_FIRST, LMK_ASYNC_EVENT_KILL = LMK_ASYNC_EVENT_FIRST, + LMK_ASYNC_EVENT_STAT, LMK_ASYNC_EVENT_COUNT, }; diff --git a/lmkd.cpp b/lmkd.cpp index e1c5276..dc93ac4 100644 --- a/lmkd.cpp +++ b/lmkd.cpp @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -758,6 +757,49 @@ static void ctrl_data_write_lmk_kill_occurred(pid_t pid, uid_t uid) { } } +/* + * Write the kill_stat/memory_stat over the data socket to be propagated via AMS to statsd + */ +static void stats_write_lmk_kill_occurred(struct kill_stat *kill_st, + struct memory_stat *mem_st) { + LMK_KILL_OCCURRED_PACKET packet; + const size_t len = lmkd_pack_set_kill_occurred(packet, kill_st, mem_st); + if (len == 0) { + return; + } + + for (int i = 0; i < MAX_DATA_CONN; i++) { + if (data_sock[i].sock >= 0 && data_sock[i].async_event_mask & 1 << LMK_ASYNC_EVENT_STAT) { + ctrl_data_write(i, packet, len); + } + } + +} + +static void stats_write_lmk_kill_occurred_pid(int pid, struct kill_stat *kill_st, + struct memory_stat *mem_st) { + kill_st->taskname = stats_get_task_name(pid); + if (kill_st->taskname != NULL) { + stats_write_lmk_kill_occurred(kill_st, mem_st); + } +} + +/* + * Write the state_changed over the data socket to be propagated via AMS to statsd + */ +static void stats_write_lmk_state_changed(enum lmk_state state) { + LMKD_CTRL_PACKET packet_state_changed; + const size_t len = lmkd_pack_set_state_changed(packet_state_changed, state); + if (len == 0) { + return; + } + for (int i = 0; i < MAX_DATA_CONN; i++) { + if (data_sock[i].sock >= 0 && data_sock[i].async_event_mask & 1 << LMK_ASYNC_EVENT_STAT) { + ctrl_data_write(i, (char*)packet_state_changed, len); + } + } +} + static void poll_kernel(int poll_fd) { if (poll_fd == -1) { // not waiting @@ -2183,8 +2225,7 @@ static int find_and_kill_process(int min_score_adj, enum kill_reasons kill_reaso if (killed_size >= 0) { if (!lmk_state_change_start) { lmk_state_change_start = true; - stats_write_lmk_state_changed( - android::lmkd::stats::LMK_STATE_CHANGED__STATE__START); + stats_write_lmk_state_changed(STATE_START); } break; } @@ -2195,7 +2236,7 @@ static int find_and_kill_process(int min_score_adj, enum kill_reasons kill_reaso } if (lmk_state_change_start) { - stats_write_lmk_state_changed(android::lmkd::stats::LMK_STATE_CHANGED__STATE__STOP); + stats_write_lmk_state_changed(STATE_STOP); } return killed_size; diff --git a/statslog.cpp b/statslog.cpp index 5560b6b..ba39f54 100644 --- a/statslog.cpp +++ b/statslog.cpp @@ -24,21 +24,27 @@ #include #include #include +#include +#include #include #include #include #ifdef LMKD_LOG_STATS -#define LINE_MAX 128 #define STRINGIFY(x) STRINGIFY_INTERNAL(x) #define STRINGIFY_INTERNAL(x) #x +/** + * Used to make sure that the payload is always smaller than LMKD_REPLY_MAX_SIZE + */ +#define BUILD_BUG_ON(cond) ((void)sizeof(char[1 - 2 * !!(cond)])) + static bool enable_stats_log = property_get_bool("ro.lmk.log_stats", true); struct proc { int pid; - char taskname[LINE_MAX]; + char taskname[MAX_TASKNAME_LEN]; struct proc* pidhash_next; }; @@ -46,20 +52,6 @@ struct proc { static struct proc** pidhash = NULL; #define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1)) -/** - * Logs the change in LMKD state which is used as start/stop boundaries for logging - * LMK_KILL_OCCURRED event. - * Code: LMK_STATE_CHANGED = 54 - */ -int -stats_write_lmk_state_changed(int32_t state) { - if (enable_stats_log) { - return android::lmkd::stats::stats_write(android::lmkd::stats::LMK_STATE_CHANGED, state); - } else { - return -EINVAL; - } -} - static struct proc* pid_lookup(int pid) { struct proc* procp; @@ -71,68 +63,11 @@ static struct proc* pid_lookup(int pid) { return procp; } -inline int32_t map_kill_reason(enum kill_reasons reason) { - switch (reason) { - case PRESSURE_AFTER_KILL: - return android::lmkd::stats::LMK_KILL_OCCURRED__REASON__PRESSURE_AFTER_KILL; - case NOT_RESPONDING: - return android::lmkd::stats::LMK_KILL_OCCURRED__REASON__NOT_RESPONDING; - case LOW_SWAP_AND_THRASHING: - return android::lmkd::stats::LMK_KILL_OCCURRED__REASON__LOW_SWAP_AND_THRASHING; - case LOW_MEM_AND_SWAP: - return android::lmkd::stats::LMK_KILL_OCCURRED__REASON__LOW_MEM_AND_SWAP; - case LOW_MEM_AND_THRASHING: - return android::lmkd::stats::LMK_KILL_OCCURRED__REASON__LOW_MEM_AND_THRASHING; - case DIRECT_RECL_AND_THRASHING: - return android::lmkd::stats::LMK_KILL_OCCURRED__REASON__DIRECT_RECL_AND_THRASHING; - case LOW_MEM_AND_SWAP_UTIL: - return android::lmkd::stats::LMK_KILL_OCCURRED__REASON__LOW_MEM_AND_SWAP_UTIL; - default: - return android::lmkd::stats::LMK_KILL_OCCURRED__REASON__UNKNOWN; - } -} - -/** - * Logs the event when LMKD kills a process to reduce memory pressure. - * Code: LMK_KILL_OCCURRED = 51 - */ -int stats_write_lmk_kill_occurred(struct kill_stat *kill_st, struct memory_stat *mem_st) { - if (enable_stats_log) { - return android::lmkd::stats::stats_write( - android::lmkd::stats::LMK_KILL_OCCURRED, - kill_st->uid, - kill_st->taskname, - kill_st->oom_score, - mem_st ? mem_st->pgfault : -1, - mem_st ? mem_st->pgmajfault : -1, - mem_st ? mem_st->rss_in_bytes : -1, - mem_st ? mem_st->cache_in_bytes : -1, - mem_st ? mem_st->swap_in_bytes : -1, - mem_st ? mem_st->process_start_time_ns : -1, - kill_st->min_oom_score, - kill_st->free_mem_kb, - kill_st->free_swap_kb, - map_kill_reason(kill_st->kill_reason) - ); - } else { - return -EINVAL; - } -} - -int stats_write_lmk_kill_occurred_pid(int pid, struct kill_stat *kill_st, - struct memory_stat* mem_st) { - struct proc* proc = pid_lookup(pid); - if (!proc) return -EINVAL; - - kill_st->taskname = proc->taskname; - return stats_write_lmk_kill_occurred(kill_st, mem_st); -} - static void memory_stat_parse_line(char* line, struct memory_stat* mem_st) { - char key[LINE_MAX + 1]; + char key[MAX_TASKNAME_LEN + 1]; int64_t value; - sscanf(line, "%" STRINGIFY(LINE_MAX) "s %" SCNd64 "", key, &value); + sscanf(line, "%" STRINGIFY(MAX_TASKNAME_LEN) "s %" SCNd64 "", key, &value); if (strcmp(key, "total_") < 0) { return; @@ -275,8 +210,8 @@ void stats_store_taskname(int pid, const char* taskname) { } procp = static_cast(malloc(sizeof(struct proc))); procp->pid = pid; - strncpy(procp->taskname, taskname, LINE_MAX - 1); - procp->taskname[LINE_MAX - 1] = '\0'; + strncpy(procp->taskname, taskname, MAX_TASKNAME_LEN - 1); + procp->taskname[MAX_TASKNAME_LEN - 1] = '\0'; proc_insert(procp); } @@ -299,4 +234,110 @@ void stats_purge_tasknames() { memset(pidhash, 0, PIDHASH_SZ * sizeof(*pidhash)); } +const char* stats_get_task_name(int pid) { + struct proc* proc = pid_lookup(pid); + return proc ? proc->taskname : NULL; +} + +/** + * Writes int32 in a machine independent way + * https://docs.oracle.com/javase/7/docs/api/java/io/DataOutput.html#writeInt(int) + */ +static inline size_t pack_int32(LMK_KILL_OCCURRED_PACKET packet, + size_t index, + int32_t value) { + int32_t* int_buffer = (int32_t*)(packet + index); + + *int_buffer = htonl(value); + + return index + sizeof(int32_t); +} + +/** + * Writes int64 in a machine independent way + * https://docs.oracle.com/javase/7/docs/api/java/io/DataOutput.html#writeLong(long) + */ +static inline size_t pack_int64(LMK_KILL_OCCURRED_PACKET packet, + size_t index, + int64_t value) { + int64_t* int64_buffer = (int64_t*)(packet + index); + + *int64_buffer = htonq(value); + + return index + sizeof(int64_t); +} + +/** + * Writes ANSI string in a machine independent way + * https://docs.oracle.com/javase/7/docs/api/java/io/DataOutput.html#writeShort(int) + * 2 bytes str len following n chars + * to be read on the Java side with + * https://docs.oracle.com/javase/7/docs/api/java/io/DataInput.html#readUTF() + * Truncates the value string & packs up to MAX_TASKNAME_LEN - 1 chars + */ +static inline size_t pack_string(LMK_KILL_OCCURRED_PACKET packet, + size_t index, + const char* value) { + const size_t len_proc_name = MIN(strlen(value), MAX_TASKNAME_LEN - 1); + int16_t* short_buffer = (int16_t*)(packet + index); + *short_buffer = htons((int16_t)len_proc_name); + + char* byte_buffer = (char*)(short_buffer + 1); + strncpy(byte_buffer, value, MAX_TASKNAME_LEN - 1); + byte_buffer[MAX_TASKNAME_LEN - 1] = '\0'; + + return index + sizeof(int16_t) + len_proc_name + 1; +} + +size_t lmkd_pack_set_kill_occurred(LMK_KILL_OCCURRED_PACKET packet, + struct kill_stat *kill_stat, + struct memory_stat *mem_stat) { + BUILD_BUG_ON(sizeof(LMK_KILL_OCCURRED_PACKET) > LMKD_REPLY_MAX_SIZE); + + if (!enable_stats_log) { + return 0; + } + + int32_t index = 0; + index = pack_int32(packet, index, LMK_STAT_KILL_OCCURRED); + + if (mem_stat) { + index = pack_int64(packet, index, mem_stat->pgfault); + index = pack_int64(packet, index, mem_stat->pgmajfault); + index = pack_int64(packet, index, mem_stat->rss_in_bytes); + index = pack_int64(packet, index, mem_stat->cache_in_bytes); + index = pack_int64(packet, index, mem_stat->swap_in_bytes); + index = pack_int64(packet, index, mem_stat->process_start_time_ns); + } else { + index = pack_int64(packet, index, -1); + index = pack_int64(packet, index, -1); + index = pack_int64(packet, index, -1); + index = pack_int64(packet, index, -1); + index = pack_int64(packet, index, -1); + index = pack_int64(packet, index, -1); + } + + index = pack_int32(packet, index, kill_stat->uid); + index = pack_int32(packet, index, kill_stat->oom_score); + index = pack_int32(packet, index, kill_stat->min_oom_score); + index = pack_int32(packet, index, (int)kill_stat->free_mem_kb); + index = pack_int32(packet, index, (int)kill_stat->free_swap_kb); + index = pack_int32(packet, index, (int)kill_stat->kill_reason); + + index = pack_string(packet, index, kill_stat->taskname); + return index; +} + +size_t lmkd_pack_set_state_changed(LMKD_CTRL_PACKET packet, + enum lmk_state state) { + if (!enable_stats_log) { + return 0; + } + + packet[0] = htonl(LMK_STAT_STATE_CHANGED); + packet[1] = htonl(state); + + return 2 * sizeof(int); +} + #endif /* LMKD_LOG_STATS */ diff --git a/statslog.h b/statslog.h index 628775f..e0859fa 100644 --- a/statslog.h +++ b/statslog.h @@ -17,9 +17,10 @@ #ifndef _STATSLOG_H_ #define _STATSLOG_H_ +#include + #include #include -#include #include #include #include @@ -28,6 +29,21 @@ __BEGIN_DECLS +#define MAX_TASKNAME_LEN 128 + +/* + * Max LMKD reply packet length in bytes + * Notes about size calculation: + * 4 bytes for packet type + * 80 bytes for the LmkKillOccurred fields: memory_stat + kill_stat + * 2 bytes for process name string size + * MAX_TASKNAME_LEN bytes for the process name string + * + * Must be in sync with LmkdConnection.java + */ +#define LMKD_REPLY_MAX_SIZE 214 + +/* LMK_MEMORY_STATS packet payload */ struct memory_stat { int64_t pgfault; int64_t pgmajfault; @@ -37,7 +53,7 @@ struct memory_stat { int64_t process_start_time_ns; }; -// If you update this, also update the corresponding stats enum mapping. +// If you update this, also update the corresponding stats enum mapping and LmkdStatsReporter.java enum kill_reasons { NONE = -1, /* To denote no kill condition */ PRESSURE_AFTER_KILL = 0, @@ -50,9 +66,10 @@ enum kill_reasons { KILL_REASON_COUNT }; +/* LMK_KILL_STAT packet payload */ struct kill_stat { int32_t uid; - char *taskname; + const char *taskname; enum kill_reasons kill_reason; int32_t oom_score; int32_t min_oom_score; @@ -60,6 +77,16 @@ struct kill_stat { int64_t free_swap_kb; }; +/* LMKD reply packet to hold data for the LmkKillOccurred statsd atom */ +typedef char LMK_KILL_OCCURRED_PACKET[LMKD_REPLY_MAX_SIZE]; + +// If you update this, also update the corresponding stats enum mapping. +enum lmk_state { + STATE_UNKNOWN = 0, + STATE_START, + STATE_STOP, +}; + #ifdef LMKD_LOG_STATS #define MEMCG_PROCESS_MEMORY_STAT_PATH "/dev/memcg/apps/uid_%u/pid_%d/memory.stat" @@ -68,25 +95,20 @@ struct kill_stat { #define BYTES_IN_KILOBYTE 1024 /** - * Logs the change in LMKD state which is used as start/stop boundaries for logging + * Produces packet with the change in LMKD state which is used as start/stop boundaries for logging * LMK_KILL_OCCURRED event. * Code: LMK_STATE_CHANGED = 54 */ -int -stats_write_lmk_state_changed(int32_t state); +size_t lmkd_pack_set_state_changed(LMKD_CTRL_PACKET packet, + enum lmk_state state); /** - * Logs the event when LMKD kills a process to reduce memory pressure. + * Produces packet with the event when LMKD kills a process to reduce memory pressure. * Code: LMK_KILL_OCCURRED = 51 */ -int stats_write_lmk_kill_occurred(struct kill_stat *kill_st, struct memory_stat *mem_st); - -/** - * Logs the event when LMKD kills a process to reduce memory pressure. - * Code: LMK_KILL_OCCURRED = 51 - */ -int stats_write_lmk_kill_occurred_pid(int pid, struct kill_stat *kill_st, - struct memory_stat* mem_st); +size_t lmkd_pack_set_kill_occurred(LMK_KILL_OCCURRED_PACKET packet, + struct kill_stat *kill_st, + struct memory_stat *mem_st); /** * Reads memory stats used to log the statsd atom. Returns non-null ptr on success. @@ -109,19 +131,19 @@ void stats_purge_tasknames(); */ void stats_remove_taskname(int pid); +const char* stats_get_task_name(int pid); + #else /* LMKD_LOG_STATS */ -static inline int -stats_write_lmk_state_changed(int32_t state __unused) { return -EINVAL; } - -static inline int -stats_write_lmk_kill_occurred(struct kill_stat *kill_st __unused, - struct memory_stat *mem_st __unused) { +static inline size_t +lmkd_pack_set_state_changed(LMKD_CTRL_PACKET packet __unused, enum lmk_state state __unused) { return -EINVAL; } -int stats_write_lmk_kill_occurred_pid(int pid __unused, struct kill_stat *kill_st __unused, - struct memory_stat* mem_st __unused) { +static inline size_t +lmkd_pack_set_kill_occurred(LMK_KILL_OCCURRED_PACKET packet __unused, + struct kill_stat *kill_st __unused, + struct memory_stat *mem_st __unused) { return -EINVAL; } @@ -137,6 +159,8 @@ static inline void stats_purge_tasknames() {} static inline void stats_remove_taskname(int pid __unused) {} +static inline const char* stats_get_task_name(int pid __unused) { return NULL; } + #endif /* LMKD_LOG_STATS */ __END_DECLS From a01f251ff271dafbd441a3372364957613ff9162 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 27 May 2021 18:15:44 -0700 Subject: [PATCH 3/7] lmkd: Include total GPU memory usage in killinfo reports /sys/fs/bpf/map_gpu_mem_gpu_mem_total_map BPF map exposes total GPU allocations size. Include this value into killinfo reports to track GPU allocation size at the time of the kill. Bug: 189366037 Test: lmkd_unit_test Signed-off-by: Suren Baghdasaryan Change-Id: Icc1ed8ab2593530fa293ff9c82f6c8dc400485f5 Merged-In: Icc1ed8ab2593530fa293ff9c82f6c8dc400485f5 --- Android.bp | 3 +++ event.logtags | 2 +- lmkd.cpp | 21 +++++++++++++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/Android.bp b/Android.bp index fcf26ca..4f53ded 100644 --- a/Android.bp +++ b/Android.bp @@ -23,6 +23,9 @@ cc_binary { "libstatslogc", "liblmkd_utils", ], + header_libs: [ + "bpf_syscall_wrappers", + ], local_include_dirs: ["include"], cflags: [ "-Wall", diff --git a/event.logtags b/event.logtags index 0de32b7..5382b49 100644 --- a/event.logtags +++ b/event.logtags @@ -35,4 +35,4 @@ # TODO: generate ".java" and ".h" files with integer constants from this file. # for killinfo logs -10195355 killinfo (Pid|1|5),(Uid|1|5),(OomAdj|1),(MinOomAdj|1),(TaskSize|1),(enum kill_reasons|1|5),(MemFree|1),(Cached|1),(SwapCached|1),(Buffers|1),(Shmem|1),(Unevictable|1),(SwapTotal|1),(SwapFree|1),(ActiveAnon|1),(InactiveAnon|1),(ActiveFile|1),(InactiveFile|1),(SReclaimable|1),(SUnreclaim|1),(KernelStack|1),(PageTables|1),(IonHeap|1),(IonHeapPool|1),(CmaFree|1),(MsSinceEvent|1),(MsSincePrevWakeup|1),(WakeupsSinceEvent|1),(SkippedWakeups|1),(TaskSwapSize|1) +10195355 killinfo (Pid|1|5),(Uid|1|5),(OomAdj|1),(MinOomAdj|1),(TaskSize|1),(enum kill_reasons|1|5),(MemFree|1),(Cached|1),(SwapCached|1),(Buffers|1),(Shmem|1),(Unevictable|1),(SwapTotal|1),(SwapFree|1),(ActiveAnon|1),(InactiveAnon|1),(ActiveFile|1),(InactiveFile|1),(SReclaimable|1),(SUnreclaim|1),(KernelStack|1),(PageTables|1),(IonHeap|1),(IonHeapPool|1),(CmaFree|1),(MsSinceEvent|1),(MsSincePrevWakeup|1),(WakeupsSinceEvent|1),(SkippedWakeups|1),(TaskSwapSize|1),(GPU|1) diff --git a/lmkd.cpp b/lmkd.cpp index dc93ac4..0afcf2f 100644 --- a/lmkd.cpp +++ b/lmkd.cpp @@ -53,6 +53,9 @@ #include "statslog.h" +#define BPF_FD_JUST_USE_INT +#include "BpfSyscallWrappers.h" + /* * Define LMKD_TRACE_KILLS to record lmkd kills in kernel traces * to profile and correlate with OOM kills @@ -431,6 +434,7 @@ union meminfo { int64_t cma_free; /* fields below are calculated rather than read from the file */ int64_t nr_file_pages; + int64_t total_gpu_kb; } field; int64_t arr[MI_FIELD_COUNT]; }; @@ -1785,6 +1789,21 @@ static bool meminfo_parse_line(char *line, union meminfo *mi) { return (match_res != PARSE_FAIL); } +static int64_t read_gpu_total_kb() { + static int fd = android::bpf::bpfFdGet( + "/sys/fs/bpf/map_gpu_mem_gpu_mem_total_map", BPF_F_RDONLY); + static constexpr uint64_t kBpfKeyGpuTotalUsage = 0; + uint64_t value; + + if (fd < 0) { + return 0; + } + + return android::bpf::findMapEntry(fd, &kBpfKeyGpuTotalUsage, &value) + ? 0 + : (int32_t)(value / 1024); +} + static int meminfo_parse(union meminfo *mi) { static struct reread_data file_data = { .filename = MEMINFO_PATH, @@ -1809,6 +1828,7 @@ static int meminfo_parse(union meminfo *mi) { } mi->field.nr_file_pages = mi->field.cached + mi->field.swap_cached + mi->field.buffers; + mi->field.total_gpu_kb = read_gpu_total_kb(); return 0; } @@ -1923,6 +1943,7 @@ static void killinfo_log(struct proc* procp, int min_oom_score, int rss_kb, android_log_write_int32(ctx, wi->wakeups_since_event); android_log_write_int32(ctx, wi->skipped_wakeups); android_log_write_int32(ctx, (int32_t)min(swap_kb, INT32_MAX)); + android_log_write_int32(ctx, (int32_t)mi->field.total_gpu_kb); android_log_write_list(ctx, LOG_ID_EVENTS); android_log_reset(ctx); From 57f12d13ce501dfdc12665c3f3f69df47cd81cb8 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 14 Jul 2021 17:57:52 -0700 Subject: [PATCH 4/7] lmkd: Add ro.lmk.filecache_min_kb property for min filecache watermark We see many cases when device keeps thrashing despite lmkd kills. This happens because killed processes do not free enough filecache to fit the current workingset completely. To prevent such cases, introduce ro.lmk.filecache_min_kb property to specify min filecache size in KB that should be reached after thrashing is detected. Lmkd will keep killing background processes until this filecache size limit is satisfied. Bug: 193293513 Signed-off-by: Suren Baghdasaryan Change-Id: I49ca4cd2f33b27fdbc432d9ce6944b1a1794b749 Merged-In: I49ca4cd2f33b27fdbc432d9ce6944b1a1794b749 --- lmkd.cpp | 27 +++++++++++++++++++++++++-- statslog.h | 1 + 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/lmkd.cpp b/lmkd.cpp index 0afcf2f..f3c301e 100644 --- a/lmkd.cpp +++ b/lmkd.cpp @@ -201,6 +201,7 @@ static int thrashing_limit_pct; static int thrashing_limit_decay_pct; static int thrashing_critical_pct; static int swap_util_max; +static int64_t filecache_min_kb; static bool use_psi_monitors = false; static int kpoll_fd; static struct psi_threshold psi_thresholds[VMPRESS_LEVEL_COUNT] = { @@ -2397,6 +2398,7 @@ static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_ static struct wakeup_info wi; static struct timespec thrashing_reset_tm; static int64_t prev_thrash_growth = 0; + static bool check_filecache = false; union meminfo mi; union vmstat vs; @@ -2475,7 +2477,10 @@ static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_ init_pgscan_kswapd = vs.field.pgscan_kswapd; reclaim = KSWAPD_RECLAIM; } else if (workingset_refault_file == prev_workingset_refault) { - /* Device is not thrashing and not reclaiming, bail out early until we see these stats changing*/ + /* + * Device is not thrashing and not reclaiming, bail out early until we see these stats + * changing + */ goto no_kill; } @@ -2488,7 +2493,7 @@ static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_ * important for a slow growing refault case. While retrying, we should keep * monitoring new thrashing counter as someone could release the memory to mitigate * the thrashing. Thus, when thrashing reset window comes, we decay the prev thrashing - * counter by window counts. if the counter is still greater than thrashing limit, + * counter by window counts. If the counter is still greater than thrashing limit, * we preserve the current prev_thrash counter so we will retry kill again. Otherwise, * we reset the prev_thrash counter so we will stop retrying. */ @@ -2570,6 +2575,7 @@ static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_ if (wmark > WMARK_MIN && thrashing < thrashing_critical_pct) { min_score_adj = PERCEPTIBLE_APP_ADJ + 1; } + check_filecache = true; } else if (swap_is_low && wmark < WMARK_HIGH) { /* Both free memory and swap are low */ kill_reason = LOW_MEM_AND_SWAP; @@ -2600,6 +2606,7 @@ static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_ if (thrashing < thrashing_critical_pct) { min_score_adj = PERCEPTIBLE_APP_ADJ + 1; } + check_filecache = true; } else if (reclaim == DIRECT_RECLAIM && thrashing > thrashing_limit) { /* Page cache is thrashing while in direct reclaim (mostly happens on lowram devices) */ kill_reason = DIRECT_RECL_AND_THRASHING; @@ -2610,6 +2617,21 @@ static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_ if (thrashing < thrashing_critical_pct) { min_score_adj = PERCEPTIBLE_APP_ADJ + 1; } + check_filecache = true; + } else if (check_filecache) { + int64_t file_lru_kb = (vs.field.nr_inactive_file + vs.field.nr_active_file) * page_k; + + if (file_lru_kb < filecache_min_kb) { + /* File cache is too low after thrashing, keep killing background processes */ + kill_reason = LOW_FILECACHE_AFTER_THRASHING; + snprintf(kill_desc, sizeof(kill_desc), + "filecache is low (%" PRId64 "kB < %" PRId64 "kB) after thrashing", + file_lru_kb, filecache_min_kb); + min_score_adj = PERCEPTIBLE_APP_ADJ + 1; + } else { + /* File cache is big enough, stop checking */ + check_filecache = false; + } } /* Kill a process if necessary */ @@ -3409,6 +3431,7 @@ static void update_props() { thrashing_critical_pct = max(0, property_get_int32("ro.lmk.thrashing_limit_critical", thrashing_limit_pct * 2)); swap_util_max = clamp(0, 100, property_get_int32("ro.lmk.swap_util_max", 100)); + filecache_min_kb = property_get_int64("ro.lmk.filecache_min_kb", 0); } int main(int argc, char **argv) { diff --git a/statslog.h b/statslog.h index e0859fa..44af35f 100644 --- a/statslog.h +++ b/statslog.h @@ -63,6 +63,7 @@ enum kill_reasons { LOW_MEM_AND_THRASHING, DIRECT_RECL_AND_THRASHING, LOW_MEM_AND_SWAP_UTIL, + LOW_FILECACHE_AFTER_THRASHING, KILL_REASON_COUNT }; From e1ffef4e36bcbdf4cc441137e25e35d851c19f86 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 20 Jul 2021 10:52:00 -0700 Subject: [PATCH 5/7] lmkd: Disable critical thrashing limit by default Critical thrashing limit determines the balance between how much thrashing should be tolerated before killing a perceptible app. This threshold might differ between devices, therefore we disable critical thrashing limit by default allowing each device to set it individually. This is done to prevent excessive kills of perceptible apps. Bug: 194199500 Signed-off-by: Suren Baghdasaryan Change-Id: Idd1715564c3727b09fcb0a109ab3d6bae9d0b99a --- lmkd.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lmkd.cpp b/lmkd.cpp index 0afcf2f..70651c7 100644 --- a/lmkd.cpp +++ b/lmkd.cpp @@ -3407,7 +3407,7 @@ static void update_props() { thrashing_limit_decay_pct = clamp(0, 100, property_get_int32("ro.lmk.thrashing_limit_decay", low_ram_device ? DEF_THRASHING_DECAY_LOWRAM : DEF_THRASHING_DECAY)); thrashing_critical_pct = max(0, property_get_int32("ro.lmk.thrashing_limit_critical", - thrashing_limit_pct * 2)); + INT_MAX)); swap_util_max = clamp(0, 100, property_get_int32("ro.lmk.swap_util_max", 100)); } From dded82877fece7ba7d267ac0a4e742c229fa5472 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 22 Jul 2021 20:59:31 +0000 Subject: [PATCH 6/7] Revert "lmkd: Disable critical thrashing limit by default" This reverts commit e1ffef4e36bcbdf4cc441137e25e35d851c19f86. Reason for revert: Restore the default thrashing limits to prevent unresponsive devices. Bug: 194199500 Change-Id: I15be5b3d67a71b68bca6dea9c2d5b4aa54d6c260 --- lmkd.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lmkd.cpp b/lmkd.cpp index 04e19fa..f3c301e 100644 --- a/lmkd.cpp +++ b/lmkd.cpp @@ -3429,7 +3429,7 @@ static void update_props() { thrashing_limit_decay_pct = clamp(0, 100, property_get_int32("ro.lmk.thrashing_limit_decay", low_ram_device ? DEF_THRASHING_DECAY_LOWRAM : DEF_THRASHING_DECAY)); thrashing_critical_pct = max(0, property_get_int32("ro.lmk.thrashing_limit_critical", - INT_MAX)); + thrashing_limit_pct * 2)); swap_util_max = clamp(0, 100, property_get_int32("ro.lmk.swap_util_max", 100)); filecache_min_kb = property_get_int64("ro.lmk.filecache_min_kb", 0); } From 81bb801dd2d681d1742914b7da9c1d07c7a7be9f Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 22 Jul 2021 16:21:21 -0700 Subject: [PATCH 7/7] lmkd: Add current and max thrashing levels in LMK_MEMORY_STATS reports Thrashing threshold tuning requires collecting thrashing level data from the field and correlating these levels with other indications of device being non-responsive. Include current and max thrashing levels in the lmkd kill reports. Max thrashing level captures the highest level seen since the last kill report. Bug: 194433891 Signed-off-by: Suren Baghdasaryan Change-Id: I8a34dc41e7f03668bfad4ac2cbcb5d2570a10752 --- lmkd.cpp | 52 ++++++++++++++++++++++++++++++++++++---------------- statslog.cpp | 2 ++ statslog.h | 6 ++++-- 3 files changed, 42 insertions(+), 18 deletions(-) diff --git a/lmkd.cpp b/lmkd.cpp index f3c301e..2f0df91 100644 --- a/lmkd.cpp +++ b/lmkd.cpp @@ -2110,10 +2110,16 @@ static void start_wait_for_proc_kill(int pid_or_fd) { maxevents++; } +struct kill_info { + enum kill_reasons kill_reason; + const char *kill_desc; + int thrashing; + int max_thrashing; +}; + /* Kill one process specified by procp. Returns the size (in pages) of the process killed */ -static int kill_one_process(struct proc* procp, int min_oom_score, enum kill_reasons kill_reason, - const char *kill_desc, union meminfo *mi, struct wakeup_info *wi, - struct timespec *tm) { +static int kill_one_process(struct proc* procp, int min_oom_score, struct kill_info *ki, + union meminfo *mi, struct wakeup_info *wi, struct timespec *tm) { int pid = procp->pid; int pidfd = procp->pidfd; uid_t uid = procp->uid; @@ -2180,19 +2186,25 @@ static int kill_one_process(struct proc* procp, int min_oom_score, enum kill_rea inc_killcnt(procp->oomadj); - killinfo_log(procp, min_oom_score, rss_kb, swap_kb, kill_reason, mi, wi, tm); - - if (kill_desc) { + if (ki) { + kill_st.kill_reason = ki->kill_reason; + kill_st.thrashing = ki->thrashing; + kill_st.max_thrashing = ki->max_thrashing; + killinfo_log(procp, min_oom_score, rss_kb, swap_kb, ki->kill_reason, mi, wi, tm); ALOGI("Kill '%s' (%d), uid %d, oom_score_adj %d to free %" PRId64 "kB rss, %" PRId64 - "kB swap; reason: %s", taskname, pid, uid, procp->oomadj, rss_kb, swap_kb, kill_desc); + "kB swap; reason: %s", taskname, pid, uid, procp->oomadj, rss_kb, swap_kb, + ki->kill_desc); } else { + kill_st.kill_reason = NONE; + kill_st.thrashing = 0; + kill_st.max_thrashing = 0; + killinfo_log(procp, min_oom_score, rss_kb, swap_kb, NONE, mi, wi, tm); ALOGI("Kill '%s' (%d), uid %d, oom_score_adj %d to free %" PRId64 "kB rss, %" PRId64 "kb swap", taskname, pid, uid, procp->oomadj, rss_kb, swap_kb); } kill_st.uid = static_cast(uid); kill_st.taskname = taskname; - kill_st.kill_reason = kill_reason; kill_st.oom_score = procp->oomadj; kill_st.min_oom_score = min_oom_score; kill_st.free_mem_kb = mi->field.nr_free_pages * page_k; @@ -2216,8 +2228,7 @@ out: * Find one process to kill at or above the given oom_score_adj level. * Returns size of the killed process. */ -static int find_and_kill_process(int min_score_adj, enum kill_reasons kill_reason, - const char *kill_desc, union meminfo *mi, +static int find_and_kill_process(int min_score_adj, struct kill_info *ki, union meminfo *mi, struct wakeup_info *wi, struct timespec *tm) { int i; int killed_size = 0; @@ -2242,8 +2253,7 @@ static int find_and_kill_process(int min_score_adj, enum kill_reasons kill_reaso if (!procp) break; - killed_size = kill_one_process(procp, min_score_adj, kill_reason, kill_desc, - mi, wi, tm); + killed_size = kill_one_process(procp, min_score_adj, ki, mi, wi, tm); if (killed_size >= 0) { if (!lmk_state_change_start) { lmk_state_change_start = true; @@ -2399,6 +2409,7 @@ static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_ static struct timespec thrashing_reset_tm; static int64_t prev_thrash_growth = 0; static bool check_filecache = false; + static int max_thrashing = 0; union meminfo mi; union vmstat vs; @@ -2524,6 +2535,9 @@ static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_ } /* Add previous cycle's decayed thrashing amount */ thrashing += prev_thrash_growth; + if (max_thrashing < thrashing) { + max_thrashing = thrashing; + } /* * Refresh watermarks once per min in case user updated one of the margins. @@ -2636,10 +2650,16 @@ static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_ /* Kill a process if necessary */ if (kill_reason != NONE) { - int pages_freed = find_and_kill_process(min_score_adj, kill_reason, kill_desc, &mi, - &wi, &curr_tm); + struct kill_info ki = { + .kill_reason = kill_reason, + .kill_desc = kill_desc, + .thrashing = (int)thrashing, + .max_thrashing = max_thrashing, + }; + int pages_freed = find_and_kill_process(min_score_adj, &ki, &mi, &wi, &curr_tm); if (pages_freed > 0) { killing = true; + max_thrashing = 0; if (cut_thrashing_limit) { /* * Cut thrasing limit by thrashing_limit_decay_pct percentage of the current @@ -2856,7 +2876,7 @@ static void mp_event_common(int data, uint32_t events, struct polling_params *po do_kill: if (low_ram_device) { /* For Go devices kill only one task */ - if (find_and_kill_process(level_oomadj[level], NONE, NULL, &mi, &wi, &curr_tm) == 0) { + if (find_and_kill_process(level_oomadj[level], NULL, &mi, &wi, &curr_tm) == 0) { if (debug_process_killing) { ALOGI("Nothing to kill"); } @@ -2879,7 +2899,7 @@ do_kill: min_score_adj = level_oomadj[level]; } - pages_freed = find_and_kill_process(min_score_adj, NONE, NULL, &mi, &wi, &curr_tm); + pages_freed = find_and_kill_process(min_score_adj, NULL, &mi, &wi, &curr_tm); if (pages_freed == 0) { /* Rate limit kill reports when nothing was reclaimed */ diff --git a/statslog.cpp b/statslog.cpp index ba39f54..6568f73 100644 --- a/statslog.cpp +++ b/statslog.cpp @@ -323,6 +323,8 @@ size_t lmkd_pack_set_kill_occurred(LMK_KILL_OCCURRED_PACKET packet, index = pack_int32(packet, index, (int)kill_stat->free_mem_kb); index = pack_int32(packet, index, (int)kill_stat->free_swap_kb); index = pack_int32(packet, index, (int)kill_stat->kill_reason); + index = pack_int32(packet, index, kill_stat->thrashing); + index = pack_int32(packet, index, kill_stat->max_thrashing); index = pack_string(packet, index, kill_stat->taskname); return index; diff --git a/statslog.h b/statslog.h index 44af35f..89e4d2e 100644 --- a/statslog.h +++ b/statslog.h @@ -35,13 +35,13 @@ __BEGIN_DECLS * Max LMKD reply packet length in bytes * Notes about size calculation: * 4 bytes for packet type - * 80 bytes for the LmkKillOccurred fields: memory_stat + kill_stat + * 88 bytes for the LmkKillOccurred fields: memory_stat + kill_stat * 2 bytes for process name string size * MAX_TASKNAME_LEN bytes for the process name string * * Must be in sync with LmkdConnection.java */ -#define LMKD_REPLY_MAX_SIZE 214 +#define LMKD_REPLY_MAX_SIZE 222 /* LMK_MEMORY_STATS packet payload */ struct memory_stat { @@ -76,6 +76,8 @@ struct kill_stat { int32_t min_oom_score; int64_t free_mem_kb; int64_t free_swap_kb; + int32_t thrashing; + int32_t max_thrashing; }; /* LMKD reply packet to hold data for the LmkKillOccurred statsd atom */