From 16e959bc676f8985d8f7fb31bb9f13b4aa288d88 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 5 Sep 2018 15:46:32 -0700 Subject: [PATCH 1/4] lmkd: rate-limit and cleanup failed kill reports Excessive number of failed kill reports when lmkd can't find an eligible process to kill or frees not enough memory pollutes logs and bugreports. Cleanup kill reports to remove duplicate information and rate limit failed kill attempts at 1 report per sec. The number of suppressed failed kills will be reported in the next lmkd report. Bug: 113864581 Test: Verified using lmkd_unit_test Change-Id: I67fa1fec97613f136c7582115edcbc56b1503c9c Merged-In: I67fa1fec97613f136c7582115edcbc56b1503c9c Signed-off-by: Suren Baghdasaryan --- lmkd.c | 80 ++++++++++++++++++++++++++------------------- tests/lmkd_test.cpp | 2 +- 2 files changed, 48 insertions(+), 34 deletions(-) diff --git a/lmkd.c b/lmkd.c index cc5a7c2..bf265b0 100644 --- a/lmkd.c +++ b/lmkd.c @@ -79,6 +79,8 @@ #define STRINGIFY(x) STRINGIFY_INTERNAL(x) #define STRINGIFY_INTERNAL(x) #x +#define FAIL_REPORT_RLIMIT_MS 1000 + /* default to old in-kernel interface if no memory pressure events */ static int use_inkernel_interface = 1; static bool has_inkernel_module; @@ -956,8 +958,7 @@ static struct proc *proc_get_heaviest(int oomadj) { } /* Kill one process specified by procp. Returns the size of the process killed */ -static int kill_one_process(struct proc* procp, int min_score_adj, - enum vmpressure_level level) { +static int kill_one_process(struct proc* procp) { int pid = procp->pid; uid_t uid = procp->uid; char *taskname; @@ -990,11 +991,8 @@ static int kill_one_process(struct proc* procp, int min_score_adj, TRACE_KILL_START(pid); r = kill(pid, SIGKILL); - ALOGI( - "Killing '%s' (%d), uid %d, adj %d\n" - " to free %ldkB because system is under %s memory pressure oom_adj %d\n", - taskname, pid, uid, procp->oomadj, tasksize * page_k, - level_name[level], min_score_adj); + ALOGI("Kill '%s' (%d), uid %d, oom_adj %d to free %ldkB", + taskname, pid, uid, procp->oomadj, tasksize * page_k); pid_remove(pid); TRACE_KILL_END(); @@ -1021,8 +1019,7 @@ static int kill_one_process(struct proc* procp, int min_score_adj, * If pages_to_free is set to 0 only one process will be killed. * Returns the size of the killed processes. */ -static int find_and_kill_processes(enum vmpressure_level level, - int min_score_adj, int pages_to_free) { +static int find_and_kill_processes(int min_score_adj, int pages_to_free) { int i; int killed_size; int pages_freed = 0; @@ -1041,7 +1038,7 @@ static int find_and_kill_processes(enum vmpressure_level level, if (!procp) break; - killed_size = kill_one_process(procp, min_score_adj, level); + killed_size = kill_one_process(procp); if (killed_size >= 0) { #ifdef LMKD_LOG_STATS if (enable_stats_log && !lmk_state_change_start) { @@ -1146,8 +1143,9 @@ static void mp_event_common(int data, uint32_t events __unused) { enum vmpressure_level lvl; union meminfo mi; union zoneinfo zi; - static struct timeval last_report_tm; - static unsigned long skip_count = 0; + struct timeval curr_tm; + static struct timeval last_kill_tm; + static unsigned long kill_skip_count = 0; enum vmpressure_level level = (enum vmpressure_level)data; long other_free = 0, other_file = 0; int min_score_adj; @@ -1176,19 +1174,18 @@ static void mp_event_common(int data, uint32_t events __unused) { } } + gettimeofday(&curr_tm, NULL); if (kill_timeout_ms) { - struct timeval curr_tm; - gettimeofday(&curr_tm, NULL); - if (get_time_diff_ms(&last_report_tm, &curr_tm) < kill_timeout_ms) { - skip_count++; + if (get_time_diff_ms(&last_kill_tm, &curr_tm) < kill_timeout_ms) { + kill_skip_count++; return; } } - if (skip_count > 0) { + if (kill_skip_count > 0) { ALOGI("%lu memory pressure events were skipped after a kill!", - skip_count); - skip_count = 0; + kill_skip_count); + kill_skip_count = 0; } if (meminfo_parse(&mi) < 0 || zoneinfo_parse(&zi) < 0) { @@ -1280,13 +1277,15 @@ static void mp_event_common(int data, uint32_t events __unused) { do_kill: if (low_ram_device) { /* For Go devices kill only one task */ - if (find_and_kill_processes(level, level_oomadj[level], 0) == 0) { + if (find_and_kill_processes(level_oomadj[level], 0) == 0) { if (debug_process_killing) { ALOGI("Nothing to kill"); } } } else { int pages_freed; + static struct timeval last_report_tm; + static unsigned long report_skip_count = 0; if (!use_minfree_levels) { /* If pressure level is less than critical and enough free swap then ignore */ @@ -1314,24 +1313,39 @@ do_kill: min_score_adj = level_oomadj[level]; } - pages_freed = find_and_kill_processes(level, min_score_adj, pages_to_free); + pages_freed = find_and_kill_processes(min_score_adj, pages_to_free); + + if (pages_freed == 0) { + /* Rate limit kill reports when nothing was reclaimed */ + if (get_time_diff_ms(&last_report_tm, &curr_tm) < FAIL_REPORT_RLIMIT_MS) { + report_skip_count++; + return; + } + } + + if (pages_freed >= pages_to_free) { + /* Reset kill time only if reclaimed enough memory */ + last_kill_tm = curr_tm; + } if (use_minfree_levels) { - ALOGI("Killing because cache %ldkB is below " - "limit %ldkB for oom_adj %d\n" - " Free memory is %ldkB %s reserved", - other_file * page_k, minfree * page_k, min_score_adj, - other_free * page_k, other_free >= 0 ? "above" : "below"); + ALOGI("Killing to reclaim %ldkB, reclaimed %ldkB, cache(%ldkB) and " + "free(%" PRId64 "kB)-reserved(%" PRId64 "kB) below min(%ldkB) for oom_adj %d", + pages_to_free * page_k, pages_freed * page_k, + other_file * page_k, mi.field.nr_free_pages * page_k, + zi.field.totalreserve_pages * page_k, + minfree * page_k, min_score_adj); + } else { + ALOGI("Killing to reclaim %ldkB, reclaimed %ldkB at oom_adj %d", + pages_to_free * page_k, pages_freed * page_k, min_score_adj); } - if (pages_freed < pages_to_free) { - ALOGI("Unable to free enough memory (pages to free=%d, pages freed=%d)", - pages_to_free, pages_freed); - } else { - ALOGI("Reclaimed enough memory (pages to free=%d, pages freed=%d)", - pages_to_free, pages_freed); - gettimeofday(&last_report_tm, NULL); + if (report_skip_count > 0) { + ALOGI("Suppressed %lu failed kill reports", report_skip_count); + report_skip_count = 0; } + + last_report_tm = curr_tm; } } diff --git a/tests/lmkd_test.cpp b/tests/lmkd_test.cpp index 4afaeb8..c2ad74a 100644 --- a/tests/lmkd_test.cpp +++ b/tests/lmkd_test.cpp @@ -39,7 +39,7 @@ using namespace android::base; #define LMKDTEST_RESPAWN_FLAG "LMKDTEST_RESPAWN" #define LMKD_LOGCAT_MARKER "lowmemorykiller" -#define LMKD_KILL_MARKER_TEMPLATE LMKD_LOGCAT_MARKER ": Killing '%s'" +#define LMKD_KILL_MARKER_TEMPLATE LMKD_LOGCAT_MARKER ": Kill '%s'" #define OOM_MARKER "Out of memory" #define OOM_KILL_MARKER "Killed process" #define MIN_LOG_SIZE 100 From 08df7871d33851495f1710d5e02d95bca8c9cbcd Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 10 Oct 2018 14:17:17 -0700 Subject: [PATCH 2/4] lmkd: Implement pid purge command to clear old pids when zygote restarts lmkd keeps a list of pids registered by ActivityManager, however on rare occasions when framework restarts and lmkd survives that list has to be purged. Implement a command that can be used to clear the pid list. Bug: 116801366 Test: locally by killing zygote process Change-Id: I71d6012f86bb83a73edd5b687e05a0848e0569b1 Signed-off-by: Suren Baghdasaryan --- include/lmkd.h | 10 ++++++++++ lmkd.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/include/lmkd.h b/include/lmkd.h index fe6364d..e8f51da 100644 --- a/include/lmkd.h +++ b/include/lmkd.h @@ -30,6 +30,7 @@ enum lmk_cmd { LMK_TARGET = 0, /* Associate minfree with oom_adj_score */ LMK_PROCPRIO, /* Register a process and set its oom_adj_score */ LMK_PROCREMOVE, /* Unregister a process */ + LMK_PROCPURGE, /* Purge all registered processes */ }; /* @@ -142,6 +143,15 @@ inline size_t lmkd_pack_set_procremove(LMKD_CTRL_PACKET packet, return 2 * sizeof(int); } +/* + * Prepare LMK_PROCPURGE packet and return packet size in bytes. + * Warning: no checks performed, caller should ensure valid parameters. + */ +inline size_t lmkd_pack_set_procpurge(LMKD_CTRL_PACKET packet) { + packet[0] = htonl(LMK_PROCPURGE); + return sizeof(int); +} + __END_DECLS #endif /* _LMKD_H_ */ diff --git a/lmkd.c b/lmkd.c index bf265b0..e36f72b 100644 --- a/lmkd.c +++ b/lmkd.c @@ -526,6 +526,31 @@ static void cmd_procremove(LMKD_CTRL_PACKET packet) { pid_remove(params.pid); } +static void cmd_procpurge() { + int i; + struct proc *procp; + struct proc *next; + + if (use_inkernel_interface) { + return; + } + + for (i = 0; i <= ADJTOSLOT(OOM_SCORE_ADJ_MAX); i++) { + procadjslot_list[i].next = &procadjslot_list[i]; + procadjslot_list[i].prev = &procadjslot_list[i]; + } + + for (i = 0; i < PIDHASH_SZ; i++) { + procp = pidhash[i]; + while (procp) { + next = procp->pidhash_next; + free(procp); + procp = next; + } + } + memset(&pidhash[0], 0, sizeof(pidhash)); +} + static void cmd_target(int ntargets, LMKD_CTRL_PACKET packet) { int i; struct lmk_target target; @@ -634,6 +659,11 @@ static void ctrl_command_handler(int dsock_idx) { goto wronglen; cmd_procremove(packet); break; + case LMK_PROCPURGE: + if (nargs != 0) + goto wronglen; + cmd_procpurge(); + break; default: ALOGE("Received unknown command code %d", cmd); return; From d128423ee46f013f5c41e77c088828d948e6ea58 Mon Sep 17 00:00:00 2001 From: Rajeev Kumar Date: Fri, 5 Oct 2018 12:34:59 -0700 Subject: [PATCH 3/4] Read memory stats from /proc/pid/stat file. (cherry pick from commit 0301683e49ab255769b15469487feaab3466167a) Bug: 117333340 Test: Manual testing using alloc-stress tool Change-Id: Ie555933aafa6a6b7aa1dbf5518ebe804376e0afd Merged-In: Ie555933aafa6a6b7aa1dbf5518ebe804376e0afd --- lmkd.c | 104 ++++++++++++++++++++++++++++++++++++++++------------- statslog.h | 3 ++ 2 files changed, 82 insertions(+), 25 deletions(-) diff --git a/lmkd.c b/lmkd.c index bf265b0..74049ea 100644 --- a/lmkd.c +++ b/lmkd.c @@ -114,6 +114,7 @@ static bool low_ram_device; static bool kill_heaviest_task; static unsigned long kill_timeout_ms; static bool use_minfree_levels; +static bool per_app_memcg; /* data required to handle events */ struct event_handler_info { @@ -523,6 +524,10 @@ static void cmd_procremove(LMKD_CTRL_PACKET packet) { return; lmkd_pack_get_procremove(packet, ¶ms); + /* + * WARNING: After pid_remove() procp is freed and can't be used! + * Therefore placed at the end of the function. + */ pid_remove(params.pid); } @@ -699,7 +704,7 @@ static void ctrl_connect_handler(int data __unused, uint32_t events __unused) { } #ifdef LMKD_LOG_STATS -static void memory_stat_parse_line(char *line, struct memory_stat *mem_st) { +static void memory_stat_parse_line(char* line, struct memory_stat* mem_st) { char key[LINE_MAX + 1]; int64_t value; @@ -721,25 +726,62 @@ static void memory_stat_parse_line(char *line, struct memory_stat *mem_st) { mem_st->swap_in_bytes = value; } -static int memory_stat_parse(struct memory_stat *mem_st, int pid, uid_t uid) { - FILE *fp; - char buf[PATH_MAX]; +static int memory_stat_from_cgroup(struct memory_stat* mem_st, int pid, uid_t uid) { + FILE* fp; + char buf[PATH_MAX]; - snprintf(buf, sizeof(buf), MEMCG_PROCESS_MEMORY_STAT_PATH, uid, pid); + snprintf(buf, sizeof(buf), MEMCG_PROCESS_MEMORY_STAT_PATH, uid, pid); - fp = fopen(buf, "r"); + fp = fopen(buf, "r"); - if (fp == NULL) { - ALOGE("%s open failed: %s", buf, strerror(errno)); - return -1; - } + if (fp == NULL) { + ALOGE("%s open failed: %s", buf, strerror(errno)); + return -1; + } - while (fgets(buf, PAGE_SIZE, fp) != NULL ) { - memory_stat_parse_line(buf, mem_st); - } - fclose(fp); + while (fgets(buf, PAGE_SIZE, fp) != NULL) { + memory_stat_parse_line(buf, mem_st); + } + fclose(fp); - return 0; + return 0; +} + +static int memory_stat_from_procfs(struct memory_stat* mem_st, int pid) { + char path[PATH_MAX]; + char buffer[PROC_STAT_BUFFER_SIZE]; + int fd, ret; + + snprintf(path, sizeof(path), PROC_STAT_FILE_PATH, pid); + if ((fd = open(path, O_RDONLY | O_CLOEXEC)) < 0) { + ALOGE("%s open failed: %s", path, strerror(errno)); + return -1; + } + + ret = read(fd, buffer, sizeof(buffer)); + if (ret < 0) { + ALOGE("%s read failed: %s", path, strerror(errno)); + close(fd); + return -1; + } + close(fd); + + // field 10 is pgfault + // field 12 is pgmajfault + // field 24 is rss_in_pages + int64_t pgfault = 0, pgmajfault = 0, rss_in_pages = 0; + if (sscanf(buffer, + "%*u %*s %*s %*d %*d %*d %*d %*d %*d %" SCNd64 " %*d " + "%" SCNd64 " %*d %*u %*u %*d %*d %*d %*d %*d %*d " + "%*d %*d %" SCNd64 "", + &pgfault, &pgmajfault, &rss_in_pages) != 3) { + return -1; + } + mem_st->pgfault = pgfault; + mem_st->pgmajfault = pgmajfault; + mem_st->rss_in_bytes = (rss_in_pages * PAGE_SIZE); + + return 0; } #endif @@ -964,6 +1006,7 @@ static int kill_one_process(struct proc* procp) { char *taskname; int tasksize; int r; + int result = -1; #ifdef LMKD_LOG_STATS struct memory_stat mem_st = {}; @@ -972,46 +1015,57 @@ static int kill_one_process(struct proc* procp) { taskname = proc_get_name(pid); if (!taskname) { - pid_remove(pid); - return -1; + goto out; } tasksize = proc_get_size(pid); if (tasksize <= 0) { - pid_remove(pid); - return -1; + goto out; } #ifdef LMKD_LOG_STATS if (enable_stats_log) { - memory_stat_parse_result = memory_stat_parse(&mem_st, pid, uid); + if (per_app_memcg) { + memory_stat_parse_result = memory_stat_from_cgroup(&mem_st, pid, uid); + } else { + memory_stat_parse_result = memory_stat_from_procfs(&mem_st, pid); + } } #endif TRACE_KILL_START(pid); + /* CAP_KILL required */ r = kill(pid, SIGKILL); ALOGI("Kill '%s' (%d), uid %d, oom_adj %d to free %ldkB", taskname, pid, uid, procp->oomadj, tasksize * page_k); - pid_remove(pid); TRACE_KILL_END(); if (r) { ALOGE("kill(%d): errno=%d", pid, errno); - return -1; + goto out; } else { #ifdef LMKD_LOG_STATS if (memory_stat_parse_result == 0) { stats_write_lmk_kill_occurred(log_ctx, LMK_KILL_OCCURRED, uid, taskname, procp->oomadj, mem_st.pgfault, mem_st.pgmajfault, mem_st.rss_in_bytes, mem_st.cache_in_bytes, mem_st.swap_in_bytes); + } else if (enable_stats_log) { + stats_write_lmk_kill_occurred(log_ctx, LMK_KILL_OCCURRED, uid, taskname, procp->oomadj, + -1, -1, tasksize * BYTES_IN_KILOBYTE, -1, -1); } #endif - return tasksize; + result = tasksize; } - return tasksize; +out: + /* + * WARNING: After pid_remove() procp is freed and can't be used! + * Therefore placed at the end of the function. + */ + pid_remove(pid); + return result; } /* @@ -1556,7 +1610,7 @@ int main(int argc __unused, char **argv __unused) { (unsigned long)property_get_int32("ro.lmk.kill_timeout_ms", 0); use_minfree_levels = property_get_bool("ro.lmk.use_minfree_levels", false); - + per_app_memcg = property_get_bool("ro.config.per_app_memcg", low_ram_device); #ifdef LMKD_LOG_STATS statslog_init(&log_ctx, &enable_stats_log); #endif diff --git a/statslog.h b/statslog.h index edebb19..8458480 100644 --- a/statslog.h +++ b/statslog.h @@ -67,6 +67,9 @@ struct memory_stat { }; #define MEMCG_PROCESS_MEMORY_STAT_PATH "/dev/memcg/apps/uid_%u/pid_%u/memory.stat" +#define PROC_STAT_FILE_PATH "/proc/%d/stat" +#define PROC_STAT_BUFFER_SIZE 1024 +#define BYTES_IN_KILOBYTE 1024 /** * Logs the change in LMKD state which is used as start/stop boundaries for logging From 6a9c142f1b99c3e64f5d15c600df8399f804f8d9 Mon Sep 17 00:00:00 2001 From: Tim Murray Date: Thu, 25 Oct 2018 17:05:41 -0700 Subject: [PATCH 4/4] DO NOT MERGE: lmkd: retune rate at which processes are killed Kill a single process at a time and try to wait up to 100ms for that process to reclaim memory before triggering another kill. Test: boots, works bug: 116877958 Change-Id: I6775d0534b3e3728c04389d3eae1a00e3cbf9f27 --- lmkd.c | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/lmkd.c b/lmkd.c index 11151ca..ef1bf10 100644 --- a/lmkd.c +++ b/lmkd.c @@ -27,8 +27,8 @@ #include #include #include -#include #include +#include #include #include @@ -1029,6 +1029,8 @@ static struct proc *proc_get_heaviest(int oomadj) { return maxprocp; } +static int last_killed_pid = -1; + /* Kill one process specified by procp. Returns the size of the process killed */ static int kill_one_process(struct proc* procp) { int pid = procp->pid; @@ -1072,6 +1074,8 @@ static int kill_one_process(struct proc* procp) { TRACE_KILL_END(); + last_killed_pid = pid; + if (r) { ALOGE("kill(%d): errno=%d", pid, errno); goto out; @@ -1219,6 +1223,22 @@ static inline unsigned long get_time_diff_ms(struct timeval *from, (to->tv_usec - from->tv_usec) / 1000; } +static bool is_kill_pending(void) { + char buf[24]; + if (last_killed_pid < 0) { + return false; + } + + snprintf(buf, sizeof(buf), "/proc/%d/", last_killed_pid); + if (access(buf, F_OK) == 0) { + return true; + } + + // reset last killed PID because there's nothing pending + last_killed_pid = -1; + return false; +} + static void mp_event_common(int data, uint32_t events __unused) { int ret; unsigned long long evcount; @@ -1260,7 +1280,11 @@ static void mp_event_common(int data, uint32_t events __unused) { gettimeofday(&curr_tm, NULL); if (kill_timeout_ms) { - if (get_time_diff_ms(&last_kill_tm, &curr_tm) < kill_timeout_ms) { + // If we're within the timeout, see if there's pending reclaim work + // from the last killed process. If there is (as evidenced by + // /proc/ continuing to exist), skip killing for now. + if ((get_time_diff_ms(&last_kill_tm, &curr_tm) < kill_timeout_ms) && + (low_ram_device || is_kill_pending())) { kill_skip_count++; return; } @@ -1397,7 +1421,7 @@ do_kill: min_score_adj = level_oomadj[level]; } - pages_freed = find_and_kill_processes(min_score_adj, pages_to_free); + pages_freed = find_and_kill_processes(min_score_adj, 0); if (pages_freed == 0) { /* Rate limit kill reports when nothing was reclaimed */ @@ -1405,10 +1429,8 @@ do_kill: report_skip_count++; return; } - } - - if (pages_freed >= pages_to_free) { - /* Reset kill time only if reclaimed enough memory */ + } else { + /* If we killed anything, update the last killed timestamp. */ last_kill_tm = curr_tm; }