From 81a7c210876daff7ed0c3f8f1b27039f99f0aed7 Mon Sep 17 00:00:00 2001 From: Carlos Galo Date: Mon, 12 Feb 2024 13:56:15 -0800 Subject: [PATCH] lmkd: Introduce kill strategy based on direct reclaim length Add kill reason for when the device is stuck in direct reclaim for longer than the configurable threshold. Only allow configurable threshold, and direct reclaim stuck detection, if memevents direct reclaim monitoring is enabled. Test: Verified direct reclaim stuck kill log with memory pressure test Test: m Bug: 244232958 Change-Id: I1156899874d2eb7e0f4b61597741087c110b3414 Signed-off-by: Carlos Galo --- README.md | 3 +++ lmkd.cpp | 18 +++++++++++++++++- lmkd.rc | 3 +++ statslog.h | 1 + 4 files changed, 24 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ae406e3..c378bda 100644 --- a/README.md +++ b/README.md @@ -92,6 +92,9 @@ properties: cycle after kill used to allow blocking of killing critical processes when not enough memory was freed in a kill cycle. Default score = 0. + - `ro.lmk.direct_reclaim_threshold_ms`: direct reclaim duration threshold in + milliseconds to consider the system as stuck in + direct reclaim. Default = 0 (disabled) lmkd will set the following Android properties according to current system configurations: diff --git a/lmkd.cpp b/lmkd.cpp index ded3510..30b4338 100644 --- a/lmkd.cpp +++ b/lmkd.cpp @@ -40,7 +40,6 @@ #include #include -#include #include #include #include @@ -161,6 +160,8 @@ static inline void trace_kill_end() {} #define DEF_PARTIAL_STALL 70 /* ro.lmk.psi_complete_stall_ms property defaults */ #define DEF_COMPLETE_STALL 700 +/* ro.lmk.direct_reclaim_threshold_ms property defaults */ +#define DEF_DIRECT_RECL_THRESH_MS 0 #define LMKD_REINIT_PROP "lmkd.reinit" @@ -228,6 +229,7 @@ static int64_t stall_limit_critical; static bool use_psi_monitors = false; static int kpoll_fd; static bool delay_monitors_until_boot; +static int direct_reclaim_threshold_ms; static struct psi_threshold psi_thresholds[VMPRESS_LEVEL_COUNT] = { { PSI_SOME, 70 }, /* 70ms out of 1sec for partial stall */ { PSI_SOME, 100 }, /* 100ms out of 1sec for partial stall */ @@ -2631,6 +2633,7 @@ static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_ int64_t workingset_refault_file; bool critical_stall = false; bool in_direct_reclaim; + long direct_reclaim_duration_ms; if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) { ALOGE("Failed to get current time"); @@ -2692,6 +2695,7 @@ static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_ init_pgscan_direct = vs.field.pgscan_direct; init_pgscan_kswapd = vs.field.pgscan_kswapd; init_pgrefill = vs.field.pgrefill; + direct_reclaim_duration_ms = get_time_diff_ms(&direct_reclaim_start_tm, &curr_tm); reclaim = DIRECT_RECLAIM; } else if (vs.field.pgscan_kswapd != init_pgscan_kswapd) { init_pgscan_kswapd = vs.field.pgscan_kswapd; @@ -2849,6 +2853,12 @@ static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_ min_score_adj = PERCEPTIBLE_APP_ADJ + 1; } check_filecache = true; + } else if (reclaim == DIRECT_RECLAIM && direct_reclaim_threshold_ms > 0 && + direct_reclaim_duration_ms > direct_reclaim_threshold_ms) { + kill_reason = DIRECT_RECL_STUCK; + snprintf(kill_desc, sizeof(kill_desc), + "device is stuck in direct reclaim (%" PRId64 "ms > %dms)", + direct_reclaim_duration_ms, direct_reclaim_threshold_ms); } else if (check_filecache) { int64_t file_lru_kb = (vs.field.nr_inactive_file + vs.field.nr_active_file) * page_k; @@ -3499,6 +3509,10 @@ static bool init_monitors() { ALOGI("Using memevents for direct reclaim detection"); } else { ALOGI("Using vmstats for direct reclaim detection"); + if (direct_reclaim_threshold_ms > 0) { + ALOGW("Kernel support for direct_reclaim_threshold_ms is not found"); + direct_reclaim_threshold_ms = 0; + } } monitors_initialized = true; @@ -3916,6 +3930,8 @@ static bool update_props() { filecache_min_kb = GET_LMK_PROPERTY(int64, "filecache_min_kb", 0); stall_limit_critical = GET_LMK_PROPERTY(int64, "stall_limit_critical", 100); delay_monitors_until_boot = GET_LMK_PROPERTY(bool, "delay_monitors_until_boot", false); + direct_reclaim_threshold_ms = + GET_LMK_PROPERTY(int64, "direct_reclaim_threshold_ms", DEF_DIRECT_RECL_THRESH_MS); reaper.enable_debug(debug_process_killing); diff --git a/lmkd.rc b/lmkd.rc index ba662b4..ffe0bc6 100644 --- a/lmkd.rc +++ b/lmkd.rc @@ -49,3 +49,6 @@ on property:persist.device_config.lmkd_native.swap_util_max=* on property:persist.device_config.lmkd_native.filecache_min_kb=* setprop lmkd.reinit ${sys.boot_completed:-0} + +on property:persist.device_config.lmkd_native.direct_reclaim_threshold_ms=* + setprop lmkd.reinit ${sys.boot_completed:-0} diff --git a/statslog.h b/statslog.h index 292d556..60c7016 100644 --- a/statslog.h +++ b/statslog.h @@ -65,6 +65,7 @@ enum kill_reasons { LOW_MEM_AND_SWAP_UTIL, LOW_FILECACHE_AFTER_THRASHING, LOW_MEM, + DIRECT_RECL_STUCK, KILL_REASON_COUNT };