Commit 4b87af0c authored by kmjohansen, committed by Sasha Goldshtein

bcc container improvements (#1051)

* Bcc should look at mountns during symbol resolution.

Allow bcc to resolve symbols in processes that have mappings in a
different mount namespace.  This allows us to obtain stack traces from
the host when our target resides in a container.  With this change it's
possible to get stacks from targets that used to show up as unknown.

* When searching for perf-map files look in container, and then host.

Allow perf-map files to exist either in the container under the pid
that's specific to the container's pid namespace, or in the host
container using the pid that's specific to the initial pid namespace.
This lets us store the perf-map either in the container or on the host,
depending upon which is easier for the person performing the debugging.

* Allow bcc to place uprobes and USDT probes in containers.

The uprobe/usdt mechanism uses the target's inode in order to determine
where to place the probe.  The inode lookup occurs at the time the file
path is written to uprobe_events.  If bpf_attach_uprobe() has been
passed a pid, and that pid is in a different mount namespace from the
caller, attempt to switch to the victim's mount namespace so that we can
select the correct inode for the probe.

* Add unit tests for the container improvements code.
parent 26383f25
......@@ -13,10 +13,12 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <sched.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdbool.h>
......@@ -392,3 +394,94 @@ char *bcc_procutils_which_so(const char *libname, int pid) {
void bcc_procutils_free(const char *ptr) {
free((void *)ptr);
}
/*
 * Enter the mount namespace of process `pid`.
 *
 * Opens /proc/self/ns/mnt and /proc/<pid>/ns/mnt, and calls setns() on the
 * latter when the two refer to different namespaces.
 *
 * Returns true only when the switch actually happened; in that case `nc`
 * holds both open descriptors and must later be handed to
 * bcc_procutils_exit_mountns() to switch back and close them.
 *
 * Returns false when `nc` is NULL, when any step fails, or when the target
 * is already in the caller's namespace.  In every false case no descriptor
 * is left open and both fields of `nc` (if non-NULL) are -1.
 */
bool bcc_procutils_enter_mountns(int pid, struct ns_cookie *nc) {
  char curnspath[4096];
  char newnspath[4096];
  int oldns = -1;
  int newns = -1;
  int len;
  struct stat ons_stat;
  struct stat nns_stat;

  if (nc == NULL)
    return false;

  /* Start from a known "not switched" state so exit_mountns() is a no-op. */
  nc->nsc_oldns = -1;
  nc->nsc_newns = -1;

  /*
   * snprintf() returns the length it wanted to write (or a negative value on
   * error), so truncation is "len >= buffer size", not "len == buffer size".
   */
  len = snprintf(curnspath, sizeof(curnspath), "/proc/self/ns/mnt");
  if (len < 0 || (size_t)len >= sizeof(curnspath)) {
    return false;
  }

  len = snprintf(newnspath, sizeof(newnspath), "/proc/%d/ns/mnt", pid);
  if (len < 0 || (size_t)len >= sizeof(newnspath)) {
    return false;
  }

  if ((oldns = open(curnspath, O_RDONLY)) < 0) {
    return false;
  }

  if ((newns = open(newnspath, O_RDONLY)) < 0) {
    goto errout;
  }

  if (fstat(oldns, &ons_stat) < 0) {
    goto errout;
  }

  if (fstat(newns, &nns_stat) < 0) {
    goto errout;
  }

  /*
   * Only switch to the new namespace if it doesn't match the existing
   * namespace.  This prevents us from getting an EPERM when trying to enter
   * an identical namespace.  A namespace is identified by the (device, inode)
   * pair of its /proc/<pid>/ns entry, so compare both fields.
   */
  if (ons_stat.st_dev == nns_stat.st_dev &&
      ons_stat.st_ino == nns_stat.st_ino) {
    goto errout;
  }

  if (setns(newns, CLONE_NEWNS) < 0) {
    goto errout;
  }

  /* Ownership of both descriptors passes to the cookie. */
  nc->nsc_oldns = oldns;
  nc->nsc_newns = newns;

  return true;

errout:
  if (oldns > -1) {
    (void) close(oldns);
  }
  if (newns > -1) {
    (void) close(newns);
  }
  return false;
}
/*
 * Undo a namespace switch recorded in `nc`.
 *
 * Does nothing and returns false when `nc` is NULL or when either
 * descriptor in it is -1.  Otherwise calls setns() on the saved original
 * namespace descriptor, then closes both descriptors and resets them to -1
 * regardless of whether setns() succeeded.
 *
 * Returns true only if setns() back to the original namespace succeeded.
 */
bool bcc_procutils_exit_mountns(struct ns_cookie *nc) {
  int *held[2];
  bool restored;
  int i;

  if (nc == NULL || nc->nsc_oldns == -1 || nc->nsc_newns == -1)
    return false;

  restored = (setns(nc->nsc_oldns, CLONE_NEWNS) == 0);

  /* Release the original-namespace descriptor first, then the target one. */
  held[0] = &nc->nsc_oldns;
  held[1] = &nc->nsc_newns;
  for (i = 0; i < 2; i++) {
    if (*held[i] > -1) {
      (void) close(*held[i]);
      *held[i] = -1;
    }
  }

  return restored;
}
......@@ -24,6 +24,11 @@ extern "C" {
#include <stdint.h>
/*
 * State of a mount-namespace switch made by bcc_procutils_enter_mountns()
 * and consumed by bcc_procutils_exit_mountns().  A field value of -1 means
 * "no descriptor held".
 */
struct ns_cookie {
/* Descriptor opened on /proc/self/ns/mnt before switching. */
int nsc_oldns;
/* Descriptor opened on /proc/<pid>/ns/mnt, the namespace that was entered. */
int nsc_newns;
};
typedef int (*bcc_procutils_modulecb)(const char *, uint64_t, uint64_t, void *);
typedef void (*bcc_procutils_ksymcb)(const char *, uint64_t, void *);
......@@ -34,6 +39,8 @@ int bcc_procutils_each_module(int pid, bcc_procutils_modulecb callback,
void *payload);
int bcc_procutils_each_ksym(bcc_procutils_ksymcb callback, void *payload);
void bcc_procutils_free(const char *ptr);
bool bcc_procutils_enter_mountns(int pid, struct ns_cookie *nc);
bool bcc_procutils_exit_mountns(struct ns_cookie *nc);
#ifdef __cplusplus
}
......
......@@ -100,12 +100,38 @@ void ProcSyms::refresh() {
int ProcSyms::_add_module(const char *modname, uint64_t start, uint64_t end,
void *payload) {
struct ns_cookie nsc = {-1, -1};
bool ns_switch = false;
int arc;
ProcSyms *ps = static_cast<ProcSyms *>(payload);
auto it = std::find_if(ps->modules_.begin(), ps->modules_.end(),
[=](const ProcSyms::Module &m) { return m.name_ == modname; });
if (it == ps->modules_.end())
it = ps->modules_.insert(ps->modules_.end(), modname);
if (it == ps->modules_.end()) {
// If modname references a perf-map, determine if we need to enter a mount
// namespace in order to read symbols from it later.
if (strstr(modname, ".map") != nullptr) {
ns_switch = bcc_procutils_enter_mountns(ps->pid_, &nsc);
if (ns_switch) {
char new_modname[4096];
arc = access(modname, R_OK);
bcc_procutils_exit_mountns(&nsc);
if (arc != 0) {
snprintf(new_modname, sizeof (new_modname), "/tmp/perf-%d.map",
ps->pid_);
it = ps->modules_.insert(ps->modules_.end(), Module(new_modname,
ps->pid_, false));
it->ranges_.push_back(ProcSyms::Module::Range(start, end));
return 0;
}
}
}
it = ps->modules_.insert(ps->modules_.end(), Module(modname, ps->pid_,
ns_switch));
}
it->ranges_.push_back(ProcSyms::Module::Range(start, end));
return 0;
}
......@@ -159,9 +185,13 @@ bool ProcSyms::resolve_name(const char *module, const char *name,
return false;
}
ProcSyms::Module::Module(const char *name)
: name_(name) {
ProcSyms::Module::Module(const char *name, int pid, bool in_ns)
: name_(name), pid_(pid), in_ns_(in_ns) {
struct ns_cookie nsc;
bcc_procutils_enter_mountns(pid_, &nsc);
is_so_ = bcc_elf_is_shared_obj(name) == 1;
bcc_procutils_exit_mountns(&nsc);
}
int ProcSyms::Module::_add_symbol(const char *symname, uint64_t start,
......@@ -177,13 +207,21 @@ bool ProcSyms::Module::is_perf_map() const {
}
void ProcSyms::Module::load_sym_table() {
struct ns_cookie nsc = {-1, -1};
if (syms_.size())
return;
if (is_perf_map())
if (is_perf_map()) {
if (in_ns_)
bcc_procutils_enter_mountns(pid_, &nsc);
bcc_perf_map_foreach_sym(name_.c_str(), _add_symbol, this);
else
} else {
bcc_procutils_enter_mountns(pid_, &nsc);
bcc_elf_foreach_sym(name_.c_str(), _add_symbol, this);
}
bcc_procutils_exit_mountns(&nsc);
std::sort(syms_.begin(), syms_.end());
}
......@@ -352,6 +390,8 @@ int bcc_foreach_symbol(const char *module, SYM_CB cb) {
int bcc_resolve_symname(const char *module, const char *symname,
const uint64_t addr, int pid, struct bcc_symbol *sym) {
uint64_t load_addr;
struct ns_cookie nsc = {-1, -1};
bool success = true;
sym->module = NULL;
sym->name = NULL;
......@@ -369,20 +409,29 @@ int bcc_resolve_symname(const char *module, const char *symname,
if (sym->module == NULL)
return -1;
bcc_procutils_enter_mountns(pid, &nsc);
if (bcc_elf_loadaddr(sym->module, &load_addr) < 0) {
sym->module = NULL;
return -1;
success = false;
goto exitns;
}
sym->name = symname;
sym->offset = addr;
if (sym->name && sym->offset == 0x0) {
if (bcc_find_symbol_addr(sym) < 0)
return -1;
if (bcc_find_symbol_addr(sym) < 0) {
sym->module = NULL;
success = false;
goto exitns;
}
}
if (sym->offset == 0x0)
exitns:
bcc_procutils_exit_mountns(&nsc);
if (!success || sym->offset == 0x0)
return -1;
sym->offset = (sym->offset - load_addr);
......
......@@ -38,6 +38,7 @@
#include <sys/stat.h>
#include <sys/types.h>
#include "bcc_proc.h"
#include "libbpf.h"
#include "perf_reader.h"
......@@ -412,6 +413,7 @@ void * bpf_attach_uprobe(int progfd, enum bpf_probe_attach_type attach_type, con
char new_name[128];
struct perf_reader *reader = NULL;
static char *event_type = "uprobe";
struct ns_cookie nsc = {-1, -1};
int n;
snprintf(new_name, sizeof(new_name), "%s_bcc_%d", ev_name, getpid());
......@@ -432,12 +434,15 @@ void * bpf_attach_uprobe(int progfd, enum bpf_probe_attach_type attach_type, con
close(kfd);
goto error;
}
bcc_procutils_enter_mountns(pid, &nsc);
if (write(kfd, buf, strlen(buf)) < 0) {
if (errno == EINVAL)
fprintf(stderr, "check dmesg output for possible cause\n");
close(kfd);
goto error;
}
bcc_procutils_exit_mountns(&nsc);
close(kfd);
snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%ss/%s", event_type, new_name);
......@@ -447,6 +452,7 @@ void * bpf_attach_uprobe(int progfd, enum bpf_probe_attach_type attach_type, con
return reader;
error:
bcc_procutils_exit_mountns(&nsc);
perf_reader_free(reader);
return NULL;
}
......
......@@ -85,10 +85,12 @@ class ProcSyms : SymbolCache {
Range(uint64_t s, uint64_t e) : start(s), end(e) {}
};
Module(const char *name);
Module(const char *name, int pid, bool in_ns);
std::string name_;
std::vector<Range> ranges_;
bool is_so_;
int pid_;
bool in_ns_;
std::unordered_set<std::string> symnames_;
std::vector<Symbol> syms_;
......
......@@ -13,10 +13,14 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <fcntl.h>
#include <dlfcn.h>
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include "bcc_elf.h"
......@@ -30,6 +34,8 @@
using namespace std;
static pid_t spawn_child(void *, bool, bool, int (*)(void *));
TEST_CASE("shared object resolution", "[c_api]") {
char *libm = bcc_procutils_which_so("m", 0);
REQUIRE(libm);
......@@ -103,6 +109,83 @@ extern "C" int _a_test_function(const char *a_string) {
return i;
}
// Prepares a private /tmp for the calling process: first marks every mount
// under / as private (recursively), then mounts a fresh tmpfs on /tmp.
// Returns 0 on success; on failure prints the reason to stderr and
// returns -1.
static int setup_tmp_mnts(void) {
  // Disconnect this mount namespace from its parent
  int rc = mount(NULL, "/", NULL, MS_REC|MS_PRIVATE, NULL);
  if (rc >= 0) {
    // create a new tmpfs mounted on /tmp
    rc = mount("tmpfs", "/tmp", "tmpfs", 0, NULL);
    if (rc >= 0) {
      return 0;
    }
    fprintf(stderr, "unable to mount /tmp in mntns: %s\n", strerror(errno));
    return -1;
  }

  fprintf(stderr, "unable to mark / PRIVATE: %s\n", strerror(errno));
  return -1;
}
// Child entry point for the "separate mount namespace" symbol test.
//
// Sets up a private /tmp, copies the system's libz.so.1 into it, loads the
// copy with dlopen() and keeps it loaded for five seconds before unloading.
// `arg` is unused.  Returns 0 on success; on any failure prints the reason
// to stderr, releases whatever was acquired, and returns -1.
static int mntns_func(void *arg) {
  int in_fd, out_fd;
  char buf[4096];
  char libpath[1024];
  ssize_t rb;
  void *dlhdl;

  (void) arg;

  if (setup_tmp_mnts() < 0) {
    return -1;
  }

  // Find libz.so.1, if it's installed
  dlhdl = dlopen("libz.so.1", RTLD_LAZY);
  if (dlhdl == NULL) {
    fprintf(stderr, "Unable to dlopen libz.so.1: %s\n", dlerror());
    return -1;
  }

  if (dlinfo(dlhdl, RTLD_DI_ORIGIN, libpath) < 0) {
    fprintf(stderr, "Unable to find origin of libz.so.1: %s\n", dlerror());
    dlclose(dlhdl);
    return -1;
  }

  dlclose(dlhdl);
  dlhdl = NULL;

  // Copy a shared library from shared mntns to private /tmp
  snprintf(buf, sizeof (buf), "%s/libz.so.1", libpath);
  in_fd = open(buf, O_RDONLY);
  if (in_fd < 0) {
    fprintf(stderr, "Unable to open %s: %s\n", buf, strerror(errno));
    return -1;
  }

  out_fd = open("/tmp/libz.so.1", O_RDWR|O_CREAT|O_EXCL,
      S_IRWXU|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH);
  if (out_fd < 0) {
    fprintf(stderr, "Unable to open /tmp/libz.so.1: %s\n", strerror(errno));
    close(in_fd);
    return -1;
  }

  while ((rb = read(in_fd, buf, sizeof (buf))) > 0) {
    // write() may accept fewer bytes than requested; keep going until the
    // whole chunk is on disk so the copied library is never truncated.
    ssize_t done = 0;
    while (done < rb) {
      ssize_t wb = write(out_fd, buf + done, rb - done);
      if (wb < 0) {
        fprintf(stderr, "Write error: %s\n", strerror(errno));
        close(in_fd);
        close(out_fd);
        return -1;
      }
      done += wb;
    }
  }

  // A negative read() result means the copy is incomplete, not finished.
  if (rb < 0) {
    fprintf(stderr, "Read error: %s\n", strerror(errno));
    close(in_fd);
    close(out_fd);
    return -1;
  }

  close(in_fd);
  close(out_fd);

  dlhdl = dlopen("/tmp/libz.so.1", RTLD_NOW);
  if (dlhdl == NULL) {
    fprintf(stderr, "dlopen error: %s\n", dlerror());
    return -1;
  }

  // Keep the library loaded for a while before unloading it.
  sleep(5);
  dlclose(dlhdl);

  return 0;
}
TEST_CASE("resolve symbol addresses for a given PID", "[c_api]") {
struct bcc_symbol sym;
void *resolver = bcc_symcache_new(getpid());
......@@ -142,6 +225,21 @@ TEST_CASE("resolve symbol addresses for a given PID", "[c_api]") {
REQUIRE(string(sym.module).find("libc") != string::npos);
REQUIRE(string("strtok") == sym.name);
}
SECTION("resolve in separate mount namespace") {
pid_t child;
uint64_t addr = 0;
child = spawn_child(0, true, true, mntns_func);
REQUIRE(child > 0);
void *resolver = bcc_symcache_new(child);
REQUIRE(resolver);
REQUIRE(bcc_symcache_resolve_name(resolver, "/tmp/libz.so.1", "zlibVersion",
&addr) == 0);
REQUIRE(addr != 0);
}
}
#define STACK_SIZE (1024 * 1024)
......@@ -151,10 +249,7 @@ static string perf_map_path(pid_t pid) {
return tfm::format("/tmp/perf-%d.map", pid);
}
static int child_func(void *arg) {
unsigned long long map_addr = (unsigned long long)arg;
string path = perf_map_path(getpid());
static int make_perf_map_file(string &path, unsigned long long map_addr) {
FILE *file = fopen(path.c_str(), "w");
if (file == NULL) {
return -1;
......@@ -163,19 +258,56 @@ static int child_func(void *arg) {
fprintf(file, "%llx 10 right_next_door_fn\n", map_addr + 0x10);
fclose(file);
return 0;
}
static int perf_map_func(void *arg) {
string path = perf_map_path(getpid());
if (make_perf_map_file(path, (unsigned long long)arg) < 0)
return -1;
sleep(5);
unlink(path.c_str());
return 0;
}
static pid_t spawn_child(void *map_addr, bool own_pidns) {
static int perf_map_func_mntns(void *arg) {
string path = perf_map_path(getpid());
if (setup_tmp_mnts() < 0) {
return -1;
}
if (make_perf_map_file(path, (unsigned long long)arg) < 0)
return -1;
sleep(5);
unlink(path.c_str());
return 0;
}
// Child entry point that writes no perf-map file: it only sets up a private
// tmpfs /tmp and then stays alive for five seconds.  `arg` is ignored.
// Returns -1 if the mount setup fails, 0 otherwise.
static int perf_map_func_noop(void *arg) {
  const bool mounts_ready = setup_tmp_mnts() >= 0;
  if (!mounts_ready) {
    return -1;
  }

  sleep(5);
  return 0;
}
static pid_t spawn_child(void *map_addr, bool own_pidns, bool own_mntns,
int (*child_func)(void *)) {
int flags = 0;
if (own_pidns)
flags |= CLONE_NEWPID;
if (own_mntns)
flags |= CLONE_NEWNS;
pid_t child = clone(child_func, /* stack grows down */ child_stack + STACK_SIZE,
flags, (void*)map_addr);
pid_t child = clone(child_func,
/* stack grows down */ child_stack + STACK_SIZE, flags, (void*)map_addr);
if (child < 0)
return -1;
......@@ -193,7 +325,7 @@ TEST_CASE("resolve symbols using /tmp/perf-pid.map", "[c_api]") {
pid_t child = -1;
SECTION("same namespace") {
child = spawn_child(map_addr, /* own_pidns */ false);
child = spawn_child(map_addr, /* own_pidns */ false, false, perf_map_func);
REQUIRE(child > 0);
void *resolver = bcc_symcache_new(child);
......@@ -213,7 +345,7 @@ TEST_CASE("resolve symbols using /tmp/perf-pid.map", "[c_api]") {
}
SECTION("separate namespace") {
child = spawn_child(map_addr, /* own_pidns */ true);
child = spawn_child(map_addr, /* own_pidns */ true, false, perf_map_func);
REQUIRE(child > 0);
void *resolver = bcc_symcache_new(child);
......@@ -225,8 +357,48 @@ TEST_CASE("resolve symbols using /tmp/perf-pid.map", "[c_api]") {
// child is PID 1 in its namespace
REQUIRE(string(sym.module) == perf_map_path(1));
REQUIRE(string("dummy_fn") == sym.name);
unlink("/tmp/perf-1.map");
}
SECTION("separate pid and mount namespace") {
child = spawn_child(map_addr, /* own_pidns */ true, true,
perf_map_func_mntns);
REQUIRE(child > 0);
void *resolver = bcc_symcache_new(child);
REQUIRE(resolver);
REQUIRE(bcc_symcache_resolve(resolver, (unsigned long long)map_addr,
&sym) == 0);
REQUIRE(sym.module);
// child is PID 1 in its namespace
REQUIRE(string(sym.module) == perf_map_path(1));
REQUIRE(string("dummy_fn") == sym.name);
}
SECTION("separate pid and mount namespace, perf-map in host") {
child = spawn_child(map_addr, /* own_pidns */ true, true,
perf_map_func_noop);
REQUIRE(child > 0);
string path = perf_map_path(child);
REQUIRE(make_perf_map_file(path, (unsigned long long)map_addr) == 0);
void *resolver = bcc_symcache_new(child);
REQUIRE(resolver);
REQUIRE(bcc_symcache_resolve(resolver, (unsigned long long)map_addr,
&sym) == 0);
REQUIRE(sym.module);
// child is PID 1 in its namespace
REQUIRE(string(sym.module) == perf_map_path(child));
REQUIRE(string("dummy_fn") == sym.name);
unlink(path.c_str());
}
munmap(map_addr, map_sz);
}
......
......@@ -4,7 +4,11 @@
import bcc
import ctypes
import errno
import os
import subprocess
import shutil
import time
import unittest
class TestUprobes(unittest.TestCase):
......@@ -60,5 +64,73 @@ int count(struct pt_regs *ctx) {
b.detach_uretprobe(name="/usr/bin/python", sym="main")
b.detach_uprobe(name="/usr/bin/python", sym="main")
def test_mount_namespace(self):
    """Attach a uprobe and a uretprobe to a library that exists only inside
    a child's private mount namespace, and check that both fire once.

    The child unshares its mount namespace, mounts a tmpfs on /tmp, copies
    libz.so.1 there, loads it and calls zlibVersion().  The parent attaches
    both probes to /tmp/libz.so.1 with the child's pid and expects the
    counter to reach 2.
    """
    text = """
#include <uapi/linux/ptrace.h>
BPF_TABLE("array", int, u64, stats, 1);
static void incr(int idx) {
u64 *ptr = stats.lookup(&idx);
if (ptr)
++(*ptr);
}
int count(struct pt_regs *ctx) {
bpf_trace_printk("count() uprobe fired");
u32 pid = bpf_get_current_pid_tgid();
if (pid == PID)
incr(0);
return 0;
}"""
    # Need to import libc from ctypes to access unshare(2)
    libc = ctypes.CDLL("libc.so.6", use_errno=True)

    # Need to find path to libz.so.1
    # universal_newlines makes the output text on both Python 2 and 3, so
    # the comparison against a str literal below is valid on either.
    libz_path = None
    p = subprocess.Popen(["ldconfig", "-p"], stdout=subprocess.PIPE,
                         universal_newlines=True)
    out, _ = p.communicate()
    for l in out.splitlines():
        n = l.split()
        # Skip blank lines; the last matching entry wins.
        if n and n[0] == "libz.so.1":
            libz_path = n[-1]
    self.assertIsNotNone(libz_path)

    # fork a child that we'll place in a separate mount namespace
    child_pid = os.fork()
    if child_pid == 0:
        # The child must never return into the unittest runner: always
        # leave through os._exit(), reporting failures on stderr first.
        exit_code = 1
        try:
            # Unshare CLONE_NEWNS
            if libc.unshare(0x00020000) == -1:
                e = ctypes.get_errno()
                raise OSError(e, errno.errorcode[e])

            # Remount root MS_REC|MS_PRIVATE
            # Byte strings are passed so ctypes hands mount(2) char*
            # arguments on Python 3 as well as Python 2.
            if libc.mount(None, b"/", None, (1<<14)|(1<<18) , None) == -1:
                e = ctypes.get_errno()
                raise OSError(e, errno.errorcode[e])

            if libc.mount(b"tmpfs", b"/tmp", b"tmpfs", 0, None) == -1:
                e = ctypes.get_errno()
                raise OSError(e, errno.errorcode[e])

            shutil.copy(libz_path, "/tmp")

            libz = ctypes.CDLL("/tmp/libz.so.1")
            time.sleep(1)
            libz.zlibVersion()
            time.sleep(5)
            exit_code = 0
        except Exception:
            import traceback
            traceback.print_exc()
        finally:
            os._exit(exit_code)

    try:
        libname = "/tmp/libz.so.1"
        symname = "zlibVersion"
        text = text.replace("PID", "%d" % child_pid)
        b = bcc.BPF(text=text)
        b.attach_uprobe(name=libname, sym=symname, fn_name="count", pid=child_pid)
        b.attach_uretprobe(name=libname, sym=symname, fn_name="count", pid=child_pid)
        time.sleep(1)
        self.assertEqual(b["stats"][ctypes.c_int(0)].value, 2)
        b.detach_uretprobe(name=libname, sym=symname, pid=child_pid)
        b.detach_uprobe(name=libname, sym=symname, pid=child_pid)
    finally:
        # Reap the child even when an attach or the assertion fails.
        os.waitpid(child_pid, 0)
if __name__ == "__main__":
unittest.main()
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment