lmkd: Introduce kill strategy based on direct reclaim length
Add a kill reason for when the device is stuck in direct reclaim for longer than a configurable threshold. The configurable threshold, and with it direct reclaim stuck detection, is only allowed when memevents direct reclaim monitoring is enabled.
Test: Verified direct reclaim stuck kill log with memory pressure test
Test: m
Bug: 244232958
Change-Id: I1156899874d2eb7e0f4b61597741087c110b3414
Signed-off-by: Carlos Galo <carlosgalo@google.com>
This commit is contained in:
parent
9e136285a6
commit
81a7c21087
|
|
@ -92,6 +92,9 @@ properties:
|
||||||
cycle after kill used to allow blocking of killing
|
cycle after kill used to allow blocking of killing
|
||||||
critical processes when not enough memory was freed
|
critical processes when not enough memory was freed
|
||||||
in a kill cycle. Default score = 0.
|
in a kill cycle. Default score = 0.
|
||||||
|
- `ro.lmk.direct_reclaim_threshold_ms`: direct reclaim duration threshold in
|
||||||
|
milliseconds to consider the system as stuck in
|
||||||
|
direct reclaim. Default = 0 (disabled).
|
||||||
|
|
||||||
lmkd will set the following Android properties according to current system
|
lmkd will set the following Android properties according to current system
|
||||||
configurations:
|
configurations:
|
||||||
|
|
|
||||||
18
lmkd.cpp
18
lmkd.cpp
|
|
@ -40,7 +40,6 @@
|
||||||
#include <shared_mutex>
|
#include <shared_mutex>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include <bpf/KernelUtils.h>
|
|
||||||
#include <bpf/WaitForProgsLoaded.h>
|
#include <bpf/WaitForProgsLoaded.h>
|
||||||
#include <cutils/properties.h>
|
#include <cutils/properties.h>
|
||||||
#include <cutils/sockets.h>
|
#include <cutils/sockets.h>
|
||||||
|
|
@ -161,6 +160,8 @@ static inline void trace_kill_end() {}
|
||||||
#define DEF_PARTIAL_STALL 70
|
#define DEF_PARTIAL_STALL 70
|
||||||
/* ro.lmk.psi_complete_stall_ms property defaults */
|
/* ro.lmk.psi_complete_stall_ms property defaults */
|
||||||
#define DEF_COMPLETE_STALL 700
|
#define DEF_COMPLETE_STALL 700
|
||||||
|
/* ro.lmk.direct_reclaim_threshold_ms property defaults */
|
||||||
|
#define DEF_DIRECT_RECL_THRESH_MS 0
|
||||||
|
|
||||||
#define LMKD_REINIT_PROP "lmkd.reinit"
|
#define LMKD_REINIT_PROP "lmkd.reinit"
|
||||||
|
|
||||||
|
|
@ -228,6 +229,7 @@ static int64_t stall_limit_critical;
|
||||||
static bool use_psi_monitors = false;
|
static bool use_psi_monitors = false;
|
||||||
static int kpoll_fd;
|
static int kpoll_fd;
|
||||||
static bool delay_monitors_until_boot;
|
static bool delay_monitors_until_boot;
|
||||||
|
static int direct_reclaim_threshold_ms;
|
||||||
static struct psi_threshold psi_thresholds[VMPRESS_LEVEL_COUNT] = {
|
static struct psi_threshold psi_thresholds[VMPRESS_LEVEL_COUNT] = {
|
||||||
{ PSI_SOME, 70 }, /* 70ms out of 1sec for partial stall */
|
{ PSI_SOME, 70 }, /* 70ms out of 1sec for partial stall */
|
||||||
{ PSI_SOME, 100 }, /* 100ms out of 1sec for partial stall */
|
{ PSI_SOME, 100 }, /* 100ms out of 1sec for partial stall */
|
||||||
|
|
@ -2631,6 +2633,7 @@ static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_
|
||||||
int64_t workingset_refault_file;
|
int64_t workingset_refault_file;
|
||||||
bool critical_stall = false;
|
bool critical_stall = false;
|
||||||
bool in_direct_reclaim;
|
bool in_direct_reclaim;
|
||||||
|
long direct_reclaim_duration_ms;
|
||||||
|
|
||||||
if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
|
if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
|
||||||
ALOGE("Failed to get current time");
|
ALOGE("Failed to get current time");
|
||||||
|
|
@ -2692,6 +2695,7 @@ static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_
|
||||||
init_pgscan_direct = vs.field.pgscan_direct;
|
init_pgscan_direct = vs.field.pgscan_direct;
|
||||||
init_pgscan_kswapd = vs.field.pgscan_kswapd;
|
init_pgscan_kswapd = vs.field.pgscan_kswapd;
|
||||||
init_pgrefill = vs.field.pgrefill;
|
init_pgrefill = vs.field.pgrefill;
|
||||||
|
direct_reclaim_duration_ms = get_time_diff_ms(&direct_reclaim_start_tm, &curr_tm);
|
||||||
reclaim = DIRECT_RECLAIM;
|
reclaim = DIRECT_RECLAIM;
|
||||||
} else if (vs.field.pgscan_kswapd != init_pgscan_kswapd) {
|
} else if (vs.field.pgscan_kswapd != init_pgscan_kswapd) {
|
||||||
init_pgscan_kswapd = vs.field.pgscan_kswapd;
|
init_pgscan_kswapd = vs.field.pgscan_kswapd;
|
||||||
|
|
@ -2849,6 +2853,12 @@ static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_
|
||||||
min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
|
min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
|
||||||
}
|
}
|
||||||
check_filecache = true;
|
check_filecache = true;
|
||||||
|
} else if (reclaim == DIRECT_RECLAIM && direct_reclaim_threshold_ms > 0 &&
|
||||||
|
direct_reclaim_duration_ms > direct_reclaim_threshold_ms) {
|
||||||
|
kill_reason = DIRECT_RECL_STUCK;
|
||||||
|
snprintf(kill_desc, sizeof(kill_desc),
|
||||||
|
"device is stuck in direct reclaim (%" PRId64 "ms > %dms)",
|
||||||
|
direct_reclaim_duration_ms, direct_reclaim_threshold_ms);
|
||||||
} else if (check_filecache) {
|
} else if (check_filecache) {
|
||||||
int64_t file_lru_kb = (vs.field.nr_inactive_file + vs.field.nr_active_file) * page_k;
|
int64_t file_lru_kb = (vs.field.nr_inactive_file + vs.field.nr_active_file) * page_k;
|
||||||
|
|
||||||
|
|
@ -3499,6 +3509,10 @@ static bool init_monitors() {
|
||||||
ALOGI("Using memevents for direct reclaim detection");
|
ALOGI("Using memevents for direct reclaim detection");
|
||||||
} else {
|
} else {
|
||||||
ALOGI("Using vmstats for direct reclaim detection");
|
ALOGI("Using vmstats for direct reclaim detection");
|
||||||
|
if (direct_reclaim_threshold_ms > 0) {
|
||||||
|
ALOGW("Kernel support for direct_reclaim_threshold_ms is not found");
|
||||||
|
direct_reclaim_threshold_ms = 0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
monitors_initialized = true;
|
monitors_initialized = true;
|
||||||
|
|
@ -3916,6 +3930,8 @@ static bool update_props() {
|
||||||
filecache_min_kb = GET_LMK_PROPERTY(int64, "filecache_min_kb", 0);
|
filecache_min_kb = GET_LMK_PROPERTY(int64, "filecache_min_kb", 0);
|
||||||
stall_limit_critical = GET_LMK_PROPERTY(int64, "stall_limit_critical", 100);
|
stall_limit_critical = GET_LMK_PROPERTY(int64, "stall_limit_critical", 100);
|
||||||
delay_monitors_until_boot = GET_LMK_PROPERTY(bool, "delay_monitors_until_boot", false);
|
delay_monitors_until_boot = GET_LMK_PROPERTY(bool, "delay_monitors_until_boot", false);
|
||||||
|
direct_reclaim_threshold_ms =
|
||||||
|
GET_LMK_PROPERTY(int64, "direct_reclaim_threshold_ms", DEF_DIRECT_RECL_THRESH_MS);
|
||||||
|
|
||||||
reaper.enable_debug(debug_process_killing);
|
reaper.enable_debug(debug_process_killing);
|
||||||
|
|
||||||
|
|
|
||||||
3
lmkd.rc
3
lmkd.rc
|
|
@ -49,3 +49,6 @@ on property:persist.device_config.lmkd_native.swap_util_max=*
|
||||||
|
|
||||||
on property:persist.device_config.lmkd_native.filecache_min_kb=*
|
on property:persist.device_config.lmkd_native.filecache_min_kb=*
|
||||||
setprop lmkd.reinit ${sys.boot_completed:-0}
|
setprop lmkd.reinit ${sys.boot_completed:-0}
|
||||||
|
|
||||||
|
on property:persist.device_config.lmkd_native.direct_reclaim_threshold_ms=*
|
||||||
|
setprop lmkd.reinit ${sys.boot_completed:-0}
|
||||||
|
|
|
||||||
|
|
@ -65,6 +65,7 @@ enum kill_reasons {
|
||||||
LOW_MEM_AND_SWAP_UTIL,
|
LOW_MEM_AND_SWAP_UTIL,
|
||||||
LOW_FILECACHE_AFTER_THRASHING,
|
LOW_FILECACHE_AFTER_THRASHING,
|
||||||
LOW_MEM,
|
LOW_MEM,
|
||||||
|
DIRECT_RECL_STUCK,
|
||||||
KILL_REASON_COUNT
|
KILL_REASON_COUNT
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue