lmkd: Allow killing perceptible apps when recorded stall is too high

When the system is under heavy memory pressure, it might be able
to keep free memory above the min watermark, avoiding perceptible app
kills. In such a situation the system might end up using all its CPU
capacity on memory reclaim and not doing productive work. To detect
this condition, check memory full stall and compare it with the new
ro.lmk.stall_limit_critical tunable representing the stall threshold.
When the recorded level is over ro.lmk.stall_limit_critical, lmkd will
be allowed to kill perceptible apps. ro.lmk.stall_limit_critical
represents the max memory full stall in % that is allowed before
perceptible apps will get killed. By default it is set to 100%, which
effectively disables the feature.
Currently, system stall is measured based on the psi memory full stall 10 s
average value; however, this definition might change in the future if better
metrics are developed. Setting ro.lmk.stall_limit_critical to 5 means
the system should be fully stalled (no productive work is done) for 5%
of the 10 sec period, resulting in a 0.5 sec loss due to the stall.

Bug: 205182133
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Change-Id: I9713e30d82641d86d1b7edb5e1ba2971b935c898
This commit is contained in:
Suren Baghdasaryan 2022-02-10 21:10:23 -08:00
parent 2bf5487381
commit 5ae47a9563
3 changed files with 80 additions and 5 deletions

View File

@ -22,12 +22,25 @@
__BEGIN_DECLS __BEGIN_DECLS
#define PSI_PATH_MEMORY "/proc/pressure/memory"
enum psi_stall_type { enum psi_stall_type {
PSI_SOME, PSI_SOME,
PSI_FULL, PSI_FULL,
PSI_TYPE_COUNT PSI_TYPE_COUNT
}; };
struct psi_stats {
float avg10;
float avg60;
float avg300;
unsigned long total;
};
/*
 * Snapshot of parsed psi statistics.
 * mem_stats holds one psi_stats entry per stall type and is indexed by
 * enum psi_stall_type (PSI_SOME, PSI_FULL).
 */
struct psi_data {
struct psi_stats mem_stats[PSI_TYPE_COUNT];
};
/* /*
* Initializes psi monitor. * Initializes psi monitor.
* stall_type, threshold_us and window_us are monitor parameters * stall_type, threshold_us and window_us are monitor parameters
@ -63,6 +76,13 @@ int unregister_psi_monitor(int epollfd, int fd);
*/ */
void destroy_psi_monitor(int fd); void destroy_psi_monitor(int fd);
/*
 * Parse psi file line content. Expected file format is:
 * some avg10=0.00 avg60=0.00 avg300=0.00 total=0
 * full avg10=0.00 avg60=0.00 avg300=0.00 total=0
 *
 * line is a single NUL-terminated line in the format above; a NULL line is
 * treated as a parse failure.
 * stall_type selects both the label the line must start with ("some" or
 * "full") and the element of stats[] that receives the parsed values.
 * stats is an array indexed by enum psi_stall_type.
 *
 * Returns 0 on success and -1 if the line is NULL, does not contain all five
 * fields, or its label does not match stall_type. After a failure the
 * contents of stats[stall_type] must not be relied upon.
 */
int parse_psi_line(char *line, enum psi_stall_type stall_type, struct psi_stats stats[]);
__END_DECLS __END_DECLS
#endif // __ANDROID_PSI_H__ #endif // __ANDROID_PSI_H__

View File

@ -28,8 +28,6 @@
#include <stdio.h> #include <stdio.h>
#include "psi/psi.h" #include "psi/psi.h"
#define PSI_MON_FILE_MEMORY "/proc/pressure/memory"
static const char* stall_type_name[] = { static const char* stall_type_name[] = {
"some", "some",
"full", "full",
@ -41,7 +39,7 @@ int init_psi_monitor(enum psi_stall_type stall_type,
int res; int res;
char buf[256]; char buf[256];
fd = TEMP_FAILURE_RETRY(open(PSI_MON_FILE_MEMORY, O_WRONLY | O_CLOEXEC)); fd = TEMP_FAILURE_RETRY(open(PSI_PATH_MEMORY, O_WRONLY | O_CLOEXEC));
if (fd < 0) { if (fd < 0) {
ALOGE("No kernel psi monitor support (errno=%d)", errno); ALOGE("No kernel psi monitor support (errno=%d)", errno);
return -1; return -1;
@ -61,7 +59,7 @@ int init_psi_monitor(enum psi_stall_type stall_type,
if (res >= (ssize_t)sizeof(buf)) { if (res >= (ssize_t)sizeof(buf)) {
ALOGE("%s line overflow for psi stall type '%s'", ALOGE("%s line overflow for psi stall type '%s'",
PSI_MON_FILE_MEMORY, stall_type_name[stall_type]); PSI_PATH_MEMORY, stall_type_name[stall_type]);
errno = EINVAL; errno = EINVAL;
goto err; goto err;
} }
@ -69,7 +67,7 @@ int init_psi_monitor(enum psi_stall_type stall_type,
res = TEMP_FAILURE_RETRY(write(fd, buf, strlen(buf) + 1)); res = TEMP_FAILURE_RETRY(write(fd, buf, strlen(buf) + 1));
if (res < 0) { if (res < 0) {
ALOGE("%s write failed for psi stall type '%s'; errno=%d", ALOGE("%s write failed for psi stall type '%s'; errno=%d",
PSI_MON_FILE_MEMORY, stall_type_name[stall_type], errno); PSI_PATH_MEMORY, stall_type_name[stall_type], errno);
goto err; goto err;
} }
@ -102,3 +100,17 @@ void destroy_psi_monitor(int fd) {
close(fd); close(fd);
} }
} }
int parse_psi_line(char *line, enum psi_stall_type stall_type, struct psi_stats stats[]) {
char type_name[5];
struct psi_stats *stat = &stats[stall_type];
if (!line || sscanf(line, "%4s avg10=%f avg60=%f avg300=%f total=%lu",
type_name, &stat->avg10, &stat->avg60, &stat->avg300, &stat->total) != 5) {
return -1;
}
if (strcmp(type_name, stall_type_name[stall_type])) {
return -1;
}
return 0;
}

View File

@ -218,6 +218,7 @@ static int thrashing_limit_decay_pct;
static int thrashing_critical_pct; static int thrashing_critical_pct;
static int swap_util_max; static int swap_util_max;
static int64_t filecache_min_kb; static int64_t filecache_min_kb;
static int64_t stall_limit_critical;
static bool use_psi_monitors = false; static bool use_psi_monitors = false;
static int kpoll_fd; static int kpoll_fd;
static struct psi_threshold psi_thresholds[VMPRESS_LEVEL_COUNT] = { static struct psi_threshold psi_thresholds[VMPRESS_LEVEL_COUNT] = {
@ -1915,6 +1916,37 @@ static int vmstat_parse(union vmstat *vs) {
return 0; return 0;
} }
/*
 * Reads a psi file through reread_file() and parses its leading lines.
 *
 * The first line is always parsed as the PSI_SOME entry. When full is true
 * the second line is additionally parsed as the PSI_FULL entry; otherwise
 * stats[PSI_FULL] is left untouched.
 *
 * Returns 0 on success, -1 if the file could not be read or a required line
 * failed to parse.
 */
static int psi_parse(struct reread_data *file_data, struct psi_stats stats[], bool full) {
    char *tok_state;
    char *contents = reread_file(file_data);

    if (contents == NULL) {
        return -1;
    }

    /* strtok_r yields NULL when no line is available; parse_psi_line rejects that. */
    char *some_line = strtok_r(contents, "\n", &tok_state);
    if (parse_psi_line(some_line, PSI_SOME, stats) != 0) {
        return -1;
    }

    if (!full) {
        return 0;
    }

    char *full_line = strtok_r(NULL, "\n", &tok_state);
    if (parse_psi_line(full_line, PSI_FULL, stats) != 0) {
        return -1;
    }

    return 0;
}
static int psi_parse_mem(struct psi_data *psi_data) {
static struct reread_data file_data = {
.filename = PSI_PATH_MEMORY,
.fd = -1,
};
return psi_parse(&file_data, psi_data->mem_stats, true);
}
enum wakeup_reason { enum wakeup_reason {
Event, Event,
Polling Polling
@ -2500,6 +2532,7 @@ static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_
union meminfo mi; union meminfo mi;
union vmstat vs; union vmstat vs;
struct psi_data psi_data;
struct timespec curr_tm; struct timespec curr_tm;
int64_t thrashing = 0; int64_t thrashing = 0;
bool swap_is_low = false; bool swap_is_low = false;
@ -2515,6 +2548,7 @@ static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_
int64_t swap_low_threshold; int64_t swap_low_threshold;
long since_thrashing_reset_ms; long since_thrashing_reset_ms;
int64_t workingset_refault_file; int64_t workingset_refault_file;
bool critical_stall = false;
if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) { if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
ALOGE("Failed to get current time"); ALOGE("Failed to get current time");
@ -2647,6 +2681,9 @@ static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_
/* Find out which watermark is breached if any */ /* Find out which watermark is breached if any */
wmark = get_lowest_watermark(&mi, &watermarks); wmark = get_lowest_watermark(&mi, &watermarks);
if (!psi_parse_mem(&psi_data)) {
critical_stall = psi_data.mem_stats[PSI_FULL].avg10 > (float)stall_limit_critical;
}
/* /*
* TODO: move this logic into a separate function * TODO: move this logic into a separate function
* Decide if killing a process is necessary and record the reason * Decide if killing a process is necessary and record the reason
@ -2744,6 +2781,11 @@ static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_
.thrashing = (int)thrashing, .thrashing = (int)thrashing,
.max_thrashing = max_thrashing, .max_thrashing = max_thrashing,
}; };
/* Allow killing perceptible apps if the system is stalled */
if (critical_stall) {
min_score_adj = 0;
}
int pages_freed = find_and_kill_process(min_score_adj, &ki, &mi, &wi, &curr_tm); int pages_freed = find_and_kill_process(min_score_adj, &ki, &mi, &wi, &curr_tm);
if (pages_freed > 0) { if (pages_freed > 0) {
killing = true; killing = true;
@ -3601,6 +3643,7 @@ static void update_props() {
thrashing_limit_pct * 2)); thrashing_limit_pct * 2));
swap_util_max = clamp(0, 100, GET_LMK_PROPERTY(int32, "swap_util_max", 100)); swap_util_max = clamp(0, 100, GET_LMK_PROPERTY(int32, "swap_util_max", 100));
filecache_min_kb = GET_LMK_PROPERTY(int64, "filecache_min_kb", 0); filecache_min_kb = GET_LMK_PROPERTY(int64, "filecache_min_kb", 0);
stall_limit_critical = GET_LMK_PROPERTY(int64, "stall_limit_critical", 100);
reaper.enable_debug(debug_process_killing); reaper.enable_debug(debug_process_killing);
} }