libbpf.c 21.6 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
/*
 * Copyright (c) 2015 PLUMgrid, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

Brenden Blanco's avatar
Brenden Blanco committed
17
#include <arpa/inet.h>
Brenden Blanco's avatar
Brenden Blanco committed
18
#include <errno.h>
Brenden Blanco's avatar
Brenden Blanco committed
19
#include <fcntl.h>
20
#include <limits.h>
Brenden Blanco's avatar
Brenden Blanco committed
21 22 23 24 25 26 27
#include <linux/bpf.h>
#include <linux/if_packet.h>
#include <linux/pkt_cls.h>
#include <linux/perf_event.h>
#include <linux/rtnetlink.h>
#include <linux/unistd.h>
#include <linux/version.h>
28
#include <linux/bpf_common.h>
Brenden Blanco's avatar
Brenden Blanco committed
29 30
#include <net/ethernet.h>
#include <net/if.h>
31
#include <stdio.h>
Brenden Blanco's avatar
Brenden Blanco committed
32 33 34
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
35
#include <sys/resource.h>
36
#include <unistd.h>
37
#include <stdbool.h>
38 39
#include <sys/stat.h>
#include <sys/types.h>
Brenden Blanco's avatar
Brenden Blanco committed
40

41
#include "bcc_proc.h"
Brenden Blanco's avatar
Brenden Blanco committed
42
#include "libbpf.h"
43
#include "perf_reader.h"
Brenden Blanco's avatar
Brenden Blanco committed
44

45 46 47
// TODO: remove these defines when linux-libc-dev exports them properly

/* Fallback bpf(2) syscall number for toolchains whose headers do not define it. */
#if !defined(__NR_bpf)
# if defined(__powerpc64__)
#  define __NR_bpf 361
# elif defined(__s390x__)
#  define __NR_bpf 351
# elif defined(__aarch64__)
#  define __NR_bpf 280
# else
   /* every other architecture falls back to this value */
#  define __NR_bpf 321
# endif
#endif
58 59 60 61 62 63 64 65 66 67 68 69 70

// Fallback for headers that do not define the socket option used by
// bpf_attach_socket().
#ifndef SO_ATTACH_BPF
#define SO_ATTACH_BPF 50
#endif

// Fallback for headers that do not define the ioctl request used to attach a
// BPF program fd to a perf event fd.
#ifndef PERF_EVENT_IOC_SET_BPF
#define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32)
#endif

// Fallback for headers that do not define this perf_event_open() flag; it is
// passed as the flags argument of every perf_event_open call in this file.
#ifndef PERF_FLAG_FD_CLOEXEC
#define PERF_FLAG_FD_CLOEXEC (1UL << 3)
#endif

71 72
// Value passed as the last argument of perf_reader_new() for kprobe, uprobe
// and tracepoint readers (presumably a page count, going by the page_cnt
// parameter bpf_open_perf_buffer() passes in the same position).
static int probe_perf_reader_page_cnt = 8;

Brenden Blanco's avatar
Brenden Blanco committed
73 74 75 76 77
/* Widen a pointer to the 64-bit integer form stored in union bpf_attr fields. */
static __u64 ptr_to_u64(void *ptr)
{
  unsigned long addr = (unsigned long) ptr;

  return (__u64) addr;
}

Huapeng Zhou's avatar
Huapeng Zhou committed
78
/*
 * Create a BPF map with bpf(BPF_MAP_CREATE) and return the syscall result
 * (negative on failure, with errno set by the syscall).
 *
 * If the first attempt fails with EPERM, RLIMIT_MEMLOCK is raised to
 * unlimited and the creation is retried once; when the limit cannot be
 * read or raised, the result of the first attempt is returned.
 */
int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, int max_entries, int map_flags)
{
  union bpf_attr req;
  struct rlimit memlock = {0};
  int map_fd;

  memset(&req, 0, sizeof(req));
  req.map_type = map_type;
  req.key_size = key_size;
  req.value_size = value_size;
  req.max_entries = max_entries;
  req.map_flags = map_flags;

  map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &req, sizeof(req));
  if (map_fd >= 0 || errno != EPERM)
    return map_fd;

  // EPERM may only mean the locked-memory limit is too low, so lift the
  // limit and try once more.
  if (getrlimit(RLIMIT_MEMLOCK, &memlock) != 0)
    return map_fd;

  memlock.rlim_max = RLIM_INFINITY;
  memlock.rlim_cur = memlock.rlim_max;
  if (setrlimit(RLIMIT_MEMLOCK, &memlock) != 0)
    return map_fd;

  return syscall(__NR_bpf, BPF_MAP_CREATE, &req, sizeof(req));
}

/*
 * Issue bpf(BPF_MAP_UPDATE_ELEM) for the map behind fd, passing key, value
 * and flags through unchanged. Returns the raw syscall result.
 */
int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags)
{
  union bpf_attr req;
  int rc;

  memset(&req, 0, sizeof(req));
  req.flags = flags;
  req.value = ptr_to_u64(value);
  req.key = ptr_to_u64(key);
  req.map_fd = fd;

  rc = syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &req, sizeof(req));
  return rc;
}

/*
 * Issue bpf(BPF_MAP_LOOKUP_ELEM) for the map behind fd; value is the buffer
 * address handed to the kernel for the result. Returns the raw syscall result.
 */
int bpf_lookup_elem(int fd, void *key, void *value)
{
  union bpf_attr req;
  int rc;

  memset(&req, 0, sizeof(req));
  req.value = ptr_to_u64(value);
  req.key = ptr_to_u64(key);
  req.map_fd = fd;

  rc = syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &req, sizeof(req));
  return rc;
}

/*
 * Issue bpf(BPF_MAP_DELETE_ELEM) for key in the map behind fd.
 * Returns the raw syscall result.
 */
int bpf_delete_elem(int fd, void *key)
{
  union bpf_attr req;
  int rc;

  memset(&req, 0, sizeof(req));
  req.key = ptr_to_u64(key);
  req.map_fd = fd;

  rc = syscall(__NR_bpf, BPF_MAP_DELETE_ELEM, &req, sizeof(req));
  return rc;
}

/*
 * Issue bpf(BPF_MAP_GET_NEXT_KEY) for the map behind fd; next_key is the
 * buffer address handed to the kernel for the result. Returns the raw
 * syscall result.
 */
int bpf_get_next_key(int fd, void *key, void *next_key)
{
  union bpf_attr req;
  int rc;

  memset(&req, 0, sizeof(req));
  req.next_key = ptr_to_u64(next_key);
  req.key = ptr_to_u64(key);
  req.map_fd = fd;

  rc = syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &req, sizeof(req));
  return rc;
}

Brendan Gregg's avatar
Brendan Gregg committed
147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177
/*
 * Scan a verifier log for known error strings and print a matching hint to
 * stderr for each one found. A NULL log is ignored.
 */
void bpf_print_hints(char *log)
{
  // Each entry pairs a log substring with the advice printed when it occurs.
  // The error strings will need maintenance to match LLVM.
  static const struct {
    const char *needle;
    const char *hint;
  } hints[] = {
    // stack busting
    { "invalid stack off=-",
      "HINT: Looks like you exceeded the BPF stack limit. "
      "This can happen if you allocate too much local variable storage. "
      "For example, if you allocated a 1 Kbyte struct (maybe for "
      "BPF_PERF_OUTPUT), busting a max stack of 512 bytes.\n\n" },
    // didn't check NULL on map lookup
    { "invalid mem access 'map_value_or_null'",
      "HINT: The 'map_value_or_null' error can happen if "
      "you dereference a pointer value from a map lookup without first "
      "checking if that pointer is NULL.\n\n" },
    // lacking a bpf_probe_read
    { "invalid mem access 'inv'",
      "HINT: The invalid mem access 'inv' error can happen "
      "if you try to dereference memory without first using "
      "bpf_probe_read() to copy it to the BPF stack. Sometimes the "
      "bpf_probe_read is automatic by the bcc rewriter, other times "
      "you'll need to be explicit.\n\n" },
  };
  size_t i;

  if (!log)
    return;

  for (i = 0; i < sizeof(hints) / sizeof(hints[0]); i++) {
    if (strstr(log, hints[i].needle))
      fprintf(stderr, "%s", hints[i].hint);
  }
}
Brenden Blanco's avatar
Brenden Blanco committed
178 179 180
// Round x up to a multiple of n using mask arithmetic; the result is only a
// correct multiple when n is a power of two.
#define ROUND_UP(x, n) (((x) + (n) - 1u) & ~((n) - 1u))

int bpf_prog_load(enum bpf_prog_type prog_type,
Brenden Blanco's avatar
Brenden Blanco committed
181
                  const struct bpf_insn *insns, int prog_len,
182 183
                  const char *license, unsigned kern_version,
                  char *log_buf, unsigned log_buf_size)
Brenden Blanco's avatar
Brenden Blanco committed
184
{
185
  union bpf_attr attr;
186 187 188 189
  char *bpf_log_buffer = NULL;
  unsigned buffer_size = 0;
  int ret = 0;

190 191 192 193 194 195 196 197
  memset(&attr, 0, sizeof(attr));
  attr.prog_type = prog_type;
  attr.insns = ptr_to_u64((void *) insns);
  attr.insn_cnt = prog_len / sizeof(struct bpf_insn);
  attr.license = ptr_to_u64((void *) license);
  attr.log_buf = ptr_to_u64(log_buf);
  attr.log_size = log_buf_size;
  attr.log_level = log_buf ? 1 : 0;
Brenden Blanco's avatar
Brenden Blanco committed
198

Brenden Blanco's avatar
Brenden Blanco committed
199
  attr.kern_version = kern_version;
200 201
  if (log_buf)
    log_buf[0] = 0;
Brenden Blanco's avatar
Brenden Blanco committed
202

203 204 205 206 207 208 209 210 211 212
  if (attr.insn_cnt > BPF_MAXINSNS) {
    ret = -1;
    errno = EINVAL;
    fprintf(stderr,
            "bpf: %s. Program too large (%d insns), at most %d insns\n\n",
            strerror(errno), attr.insn_cnt, BPF_MAXINSNS);
    return ret;
  }

  ret = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
Huapeng Zhou's avatar
Huapeng Zhou committed
213

214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231
  if (ret < 0 && errno == EPERM) {
    // When EPERM is returned, two reasons are possible:
    //  1. user has no permissions for bpf()
    //  2. user has insufficent rlimit for locked memory
    // Unfortunately, there is no api to inspect the current usage of locked
    // mem for the user, so an accurate calculation of how much memory to lock
    // for this new program is difficult to calculate. As a hack, bump the limit
    // to unlimited. If program load fails again, return the error.

    struct rlimit rl = {};
    if (getrlimit(RLIMIT_MEMLOCK, &rl) == 0) {
      rl.rlim_max = RLIM_INFINITY;
      rl.rlim_cur = rl.rlim_max;
      if (setrlimit(RLIMIT_MEMLOCK, &rl) == 0)
        ret = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
    }
  }

232
  if (ret < 0 && !log_buf) {
233 234

    buffer_size = LOG_BUF_SIZE;
235
    // caller did not specify log_buf but failure should be printed,
236 237 238 239 240 241 242 243 244
    // so repeat the syscall and print the result to stderr
    for (;;) {
         bpf_log_buffer = malloc(buffer_size);
         if (!bpf_log_buffer) {
             fprintf(stderr,
                     "bpf: buffer log memory allocation failed for error %s\n\n",
                     strerror(errno));
             return ret;
         }
245
         bpf_log_buffer[0] = 0;
246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261

         attr.log_buf = ptr_to_u64(bpf_log_buffer);
         attr.log_size = buffer_size;
         attr.log_level = bpf_log_buffer ? 1 : 0;

         ret = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
         if (ret < 0 && errno == ENOSPC) {
             free(bpf_log_buffer);
             bpf_log_buffer = NULL;
             buffer_size <<= 1;
         } else {
             break;
         }
    }

    fprintf(stderr, "bpf: %s\n%s\n", strerror(errno), bpf_log_buffer);
Brendan Gregg's avatar
Brendan Gregg committed
262
    bpf_print_hints(bpf_log_buffer);
263

Huapeng Zhou's avatar
Huapeng Zhou committed
264
    free(bpf_log_buffer);
Brenden Blanco's avatar
Brenden Blanco committed
265 266
  }
  return ret;
Brenden Blanco's avatar
Brenden Blanco committed
267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293
}

/*
 * Open a non-blocking, close-on-exec raw packet socket for all protocols and
 * bind it to the interface index resolved from name.
 * Returns the socket fd, or -1 after printing a message on failure.
 */
int bpf_open_raw_sock(const char *name)
{
  const int sock_type = SOCK_RAW | SOCK_NONBLOCK | SOCK_CLOEXEC;
  struct sockaddr_ll addr;
  int fd;

  fd = socket(PF_PACKET, sock_type, htons(ETH_P_ALL));
  if (fd < 0) {
    printf("cannot create raw socket\n");
    return -1;
  }

  memset(&addr, 0, sizeof(addr));
  addr.sll_family = AF_PACKET;
  addr.sll_ifindex = if_nametoindex(name);
  addr.sll_protocol = htons(ETH_P_ALL);

  if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) >= 0)
    return fd;

  printf("bind to %s: %s\n", name, strerror(errno));
  close(fd);
  return -1;
}

int bpf_attach_socket(int sock, int prog) {
294
  return setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog, sizeof(prog));
Brenden Blanco's avatar
Brenden Blanco committed
295 296
}

297 298
/*
 * Attach BPF program progfd to the tracing event found at event_path.
 *
 * Reads the event id from "<event_path>/id", opens a tracepoint perf event
 * for it (pid, cpu and group_fd are passed straight to perf_event_open),
 * hands the perf fd to reader, maps the reader, then sets the BPF program on
 * the event and enables it.
 *
 * Returns 0 on success and -1 on failure. Once the perf fd has been given to
 * reader it is not closed here on later failures; presumably the caller
 * releases it through perf_reader_free() (not visible in this file).
 */
static int bpf_attach_tracing_event(int progfd, const char *event_path,
    struct perf_reader *reader, int pid, int cpu, int group_fd) {
  int efd, pfd;
  int n;
  ssize_t bytes;
  char buf[256];
  struct perf_event_attr attr = {};

  n = snprintf(buf, sizeof(buf), "%s/id", event_path);
  if (n < 0 || (size_t)n >= sizeof(buf)) {
    // a truncated path would open the wrong file or fail confusingly
    fprintf(stderr, "event path too long: %s/id\n", event_path);
    return -1;
  }
  efd = open(buf, O_RDONLY, 0);
  if (efd < 0) {
    fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
    return -1;
  }

  bytes = read(efd, buf, sizeof(buf));
  if (bytes <= 0 || (size_t)bytes >= sizeof(buf)) {
    // buf may now hold unterminated file data, so report the path instead,
    // and only consult errno when read() actually failed
    fprintf(stderr, "read(%s/id): %s\n", event_path,
            bytes < 0 ? strerror(errno) : "unexpected length");
    close(efd);
    return -1;
  }
  close(efd);
  buf[bytes] = '\0';
  attr.config = strtol(buf, NULL, 0);
  attr.type = PERF_TYPE_TRACEPOINT;
  attr.sample_type = PERF_SAMPLE_RAW | PERF_SAMPLE_CALLCHAIN;
  attr.sample_period = 1;
  attr.wakeup_events = 1;
  pfd = syscall(__NR_perf_event_open, &attr, pid, cpu, group_fd, PERF_FLAG_FD_CLOEXEC);
  if (pfd < 0) {
    fprintf(stderr, "perf_event_open(%s/id): %s\n", event_path, strerror(errno));
    return -1;
  }
  perf_reader_set_fd(reader, pfd);

  if (perf_reader_mmap(reader, attr.type, attr.sample_type) < 0)
    return -1;

  if (ioctl(pfd, PERF_EVENT_IOC_SET_BPF, progfd) < 0) {
    perror("ioctl(PERF_EVENT_IOC_SET_BPF)");
    return -1;
  }
  if (ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0) < 0) {
    perror("ioctl(PERF_EVENT_IOC_ENABLE)");
    return -1;
  }

  return 0;
}

346
/*
 * Create a kprobe event named "<ev_name>_bcc_<pid>" on fn_name and attach
 * BPF program progfd to it.
 *
 * attach_type == BPF_PROBE_ENTRY registers a 'p' probe, anything else an 'r'
 * probe. If the tracing "instances" directory exists, the event is first
 * attached through a per-process instance directory (bcc_<pid>); when that
 * is not possible the global events directory is used instead.
 *
 * Returns the new perf reader on success, NULL on failure.
 */
void * bpf_attach_kprobe(int progfd, enum bpf_probe_attach_type attach_type, const char *ev_name,
                        const char *fn_name,
                        pid_t pid, int cpu, int group_fd,
                        perf_reader_cb cb, void *cb_cookie)
{
  int kfd;
  char buf[256];
  char new_name[128];
  struct perf_reader *reader = NULL;
  static const char *event_type = "kprobe";
  int n;

  snprintf(new_name, sizeof(new_name), "%s_bcc_%d", ev_name, getpid());
  reader = perf_reader_new(cb, NULL, NULL, cb_cookie, probe_perf_reader_page_cnt);
  if (!reader)
    goto error;

  snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", event_type);
  kfd = open(buf, O_WRONLY | O_APPEND, 0);
  if (kfd < 0) {
    fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
    goto error;
  }

  n = snprintf(buf, sizeof(buf), "%c:%ss/%s %s", attach_type==BPF_PROBE_ENTRY ? 'p' : 'r',
               event_type, new_name, fn_name);
  if (n < 0 || (size_t)n >= sizeof(buf)) {
    // never write a truncated definition: it would register a probe on the
    // wrong symbol or under the wrong name
    fprintf(stderr, "kprobe definition too long for %s\n", fn_name);
    close(kfd);
    goto error;
  }
  if (write(kfd, buf, strlen(buf)) < 0) {
    if (errno == EINVAL)
      fprintf(stderr, "check dmesg output for possible cause\n");
    close(kfd);
    goto error;
  }
  close(kfd);

  if (access("/sys/kernel/debug/tracing/instances", F_OK) != -1) {
    snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/instances/bcc_%d", getpid());
    if (access(buf, F_OK) == -1) {
      if (mkdir(buf, 0755) == -1)
        goto retry;
    }
    n = snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/instances/bcc_%d/events/%ss/%s",
                 getpid(), event_type, new_name);
    if (n >= 0 && (size_t)n < sizeof(buf) &&
        bpf_attach_tracing_event(progfd, buf, reader, pid, cpu, group_fd) == 0)
      goto out;
    // the instance could not be used: remove it and fall back to the
    // global events directory
    snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/instances/bcc_%d", getpid());
    rmdir(buf);
  }
retry:
  n = snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%ss/%s", event_type, new_name);
  if (n < 0 || (size_t)n >= sizeof(buf))
    goto error;
  if (bpf_attach_tracing_event(progfd, buf, reader, pid, cpu, group_fd) < 0)
    goto error;
out:
  return reader;

error:
  perf_reader_free(reader);
  return NULL;
}

406
void * bpf_attach_uprobe(int progfd, enum bpf_probe_attach_type attach_type, const char *ev_name,
407
                        const char *binary_path, uint64_t offset,
408
                        pid_t pid, int cpu, int group_fd,
409 410 411
                        perf_reader_cb cb, void *cb_cookie) 
{
  int kfd;
412 413
  char buf[PATH_MAX];
  char new_name[128];
414 415
  struct perf_reader *reader = NULL;
  static char *event_type = "uprobe";
416
  struct ns_cookie nsc = {-1, -1};
417
  int n;
418 419

  snprintf(new_name, sizeof(new_name), "%s_bcc_%d", ev_name, getpid());
420
  reader = perf_reader_new(cb, NULL, NULL, cb_cookie, probe_perf_reader_page_cnt);
421 422 423 424 425 426 427 428 429 430
  if (!reader)
    goto error;

  snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events", event_type);
  kfd = open(buf, O_WRONLY | O_APPEND, 0);
  if (kfd < 0) {
    fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
    goto error;
  }

Derek's avatar
Derek committed
431
  n = snprintf(buf, sizeof(buf), "%c:%ss/%s %s:0x%lx", attach_type==BPF_PROBE_ENTRY ? 'p' : 'r', 
432
			event_type, new_name, binary_path, offset);
433 434 435 436
  if (n >= sizeof(buf)) {
    close(kfd);
    goto error;
  }
437 438

  bcc_procutils_enter_mountns(pid, &nsc);
439 440 441 442 443 444
  if (write(kfd, buf, strlen(buf)) < 0) {
    if (errno == EINVAL)
      fprintf(stderr, "check dmesg output for possible cause\n");
    close(kfd);
    goto error;
  }
445
  bcc_procutils_exit_mountns(&nsc);
446 447 448 449 450 451 452 453 454
  close(kfd);

  snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%ss/%s", event_type, new_name);
  if (bpf_attach_tracing_event(progfd, buf, reader, pid, cpu, group_fd) < 0)
    goto error;

  return reader;

error:
455
  bcc_procutils_exit_mountns(&nsc);
456 457
  perf_reader_free(reader);
  return NULL;
458 459
}

460 461
/*
 * Remove the probe "<ev_name>_bcc_<pid>" of the given event_type by writing
 * a "-:" removal line to the matching <event_type>_events tracing file.
 * Returns 0 on success, -1 if the file cannot be opened or written.
 */
static int bpf_detach_probe(const char *ev_name, const char *event_type)
{
  char line[256];
  int events_fd;
  int rc = 0;

  snprintf(line, sizeof(line), "/sys/kernel/debug/tracing/%s_events", event_type);
  events_fd = open(line, O_WRONLY | O_APPEND, 0);
  if (events_fd < 0) {
    fprintf(stderr, "open(%s): %s\n", line, strerror(errno));
    return -1;
  }

  // reuse the buffer for the removal command itself
  snprintf(line, sizeof(line), "-:%ss/%s_bcc_%d", event_type, ev_name, getpid());
  if (write(events_fd, line, strlen(line)) < 0) {
    fprintf(stderr, "write(%s): %s\n", line, strerror(errno));
    rc = -1;
  }

  close(events_fd);
  return rc;
}

/*
 * Detach the kprobe created for ev_name, then remove this process's tracing
 * instance directory (bcc_<pid>) if it exists.
 * Returns the result of the probe removal.
 */
int bpf_detach_kprobe(const char *ev_name)
{
  char instance_dir[256];
  const int rc = bpf_detach_probe(ev_name, "kprobe");

  snprintf(instance_dir, sizeof(instance_dir),
           "/sys/kernel/debug/tracing/instances/bcc_%d", getpid());
  if (access(instance_dir, F_OK) == -1)
    return rc;

  rmdir(instance_dir);
  return rc;
}

494 495 496
/* Detach the uprobe created for ev_name; returns the removal result. */
int bpf_detach_uprobe(const char *ev_name)
{
  static const char uprobe_type[] = "uprobe";

  return bpf_detach_probe(ev_name, uprobe_type);
}


500 501 502 503 504 505
/*
 * Attach BPF program progfd to the tracepoint tp_category/tp_name under
 * /sys/kernel/debug/tracing/events.
 * Returns the new perf reader on success, NULL on failure.
 */
void * bpf_attach_tracepoint(int progfd, const char *tp_category,
                             const char *tp_name, int pid, int cpu,
                             int group_fd, perf_reader_cb cb, void *cb_cookie) {
  char event_dir[256];
  struct perf_reader *reader;

  reader = perf_reader_new(cb, NULL, NULL, cb_cookie, probe_perf_reader_page_cnt);
  if (reader) {
    snprintf(event_dir, sizeof(event_dir), "/sys/kernel/debug/tracing/events/%s/%s",
             tp_category, tp_name);
    if (bpf_attach_tracing_event(progfd, event_dir, reader, pid, cpu, group_fd) >= 0)
      return reader;
  }

  // either no reader could be created or the attach failed
  perf_reader_free(reader);
  return NULL;
}

/*
 * Counterpart of bpf_attach_tracepoint(). Nothing needs to be undone at the
 * moment; the function exists so callers detach whatever they attach.
 * Always returns 0.
 */
int bpf_detach_tracepoint(const char *tp_category, const char *tp_name) {
  (void)tp_category;
  (void)tp_name;

  return 0;
}

528 529 530
/*
 * Create a perf reader with the given raw/lost callbacks and page_cnt, open
 * a software perf event (config 10, PERF_COUNT_SW_BPF_OUTPUT) for pid/cpu,
 * hand its fd to the reader, map the reader and enable the event.
 * Returns the reader on success, NULL on failure.
 */
void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb,
                            perf_reader_lost_cb lost_cb, void *cb_cookie,
                            int pid, int cpu, int page_cnt) {
  struct perf_event_attr attr;
  struct perf_reader *reader;
  int pfd;

  reader = perf_reader_new(NULL, raw_cb, lost_cb, cb_cookie, page_cnt);
  if (!reader)
    return NULL;

  memset(&attr, 0, sizeof(attr));
  attr.type = PERF_TYPE_SOFTWARE;
  attr.config = 10;//PERF_COUNT_SW_BPF_OUTPUT;
  attr.sample_type = PERF_SAMPLE_RAW;
  attr.sample_period = 1;
  attr.wakeup_events = 1;

  pfd = syscall(__NR_perf_event_open, &attr, pid, cpu, -1, PERF_FLAG_FD_CLOEXEC);
  if (pfd < 0) {
    fprintf(stderr, "perf_event_open: %s\n", strerror(errno));
    fprintf(stderr, "   (check your kernel for PERF_COUNT_SW_BPF_OUTPUT support, 4.4 or newer)\n");
    goto fail;
  }
  perf_reader_set_fd(reader, pfd);

  if (perf_reader_mmap(reader, attr.type, attr.sample_type) < 0)
    goto fail;

  if (ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0) < 0) {
    perror("ioctl(PERF_EVENT_IOC_ENABLE)");
    goto fail;
  }

  return reader;

fail:
  perf_reader_free(reader);
  return NULL;
}
568

569 570 571 572 573 574 575 576 577 578 579 580 581
/*
 * Open and enable a perf event of the given type/config for pid/cpu, with
 * sample_period set to LONG_MAX.
 * Returns the event fd, or -1 after printing a message on failure.
 */
int bpf_open_perf_event(uint32_t type, uint64_t config, int pid, int cpu) {
  struct perf_event_attr attr = {};
  int event_fd;

  attr.type = type;
  attr.config = config;
  attr.sample_period = LONG_MAX;

  event_fd = syscall(__NR_perf_event_open, &attr, pid, cpu, -1, PERF_FLAG_FD_CLOEXEC);
  if (event_fd < 0) {
    fprintf(stderr, "perf_event_open: %s\n", strerror(errno));
    return -1;
  }

  if (ioctl(event_fd, PERF_EVENT_IOC_ENABLE, 0) >= 0)
    return event_fd;

  perror("ioctl(PERF_EVENT_IOC_ENABLE)");
  close(event_fd);
  return -1;
}
591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639

/*
 * Attach BPF program progfd as the XDP program of network device dev_name.
 *
 * Sends an RTM_SETLINK netlink request carrying a nested IFLA_XDP attribute
 * with the program fd, then checks every message of the reply for the
 * expected pid and sequence number and for a non-zero error code.
 *
 * Returns 0 on success and -1 on failure (a message is printed to stderr).
 */
int bpf_attach_xdp(const char *dev_name, int progfd) {
    struct {
        struct nlmsghdr  nh;
        struct ifinfomsg ifinfo;
        char             attrbuf[64];
    } req;
    struct sockaddr_nl local;
    struct nlattr *nest, *fd_attr;
    struct nlmsghdr *msg;
    struct nlmsgerr *nl_err;
    char reply[4096];
    int nl_fd, len;
    int seq = 0;
    int ret = -1;

    memset(&local, 0, sizeof(local));
    local.nl_family = AF_NETLINK;

    nl_fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
    if (nl_fd < 0) {
        fprintf(stderr, "bpf: opening a netlink socket: %s\n", strerror(errno));
        return -1;
    }

    if (bind(nl_fd, (struct sockaddr *)&local, sizeof(local)) < 0) {
        fprintf(stderr, "bpf: bind to netlink: %s\n", strerror(errno));
        goto cleanup;
    }

    // request header: set-link on the device resolved from dev_name
    memset(&req, 0, sizeof(req));
    req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
    req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
    req.nh.nlmsg_type = RTM_SETLINK;
    req.nh.nlmsg_pid = 0;
    req.nh.nlmsg_seq = ++seq;
    req.ifinfo.ifi_family = AF_UNSPEC;
    req.ifinfo.ifi_index = if_nametoindex(dev_name);
    if (req.ifinfo.ifi_index == 0) {
        fprintf(stderr, "bpf: Resolving device name to index: %s\n", strerror(errno));
        goto cleanup;
    }

    // outer nested attribute placed right after the aligned header
    nest = (struct nlattr *)((char *)&req + NLMSG_ALIGN(req.nh.nlmsg_len));
    nest->nla_type = NLA_F_NESTED | 43/*IFLA_XDP*/;

    // inner attribute: the FD passed over by the user
    fd_attr = (struct nlattr *)((char *)nest + NLA_HDRLEN);
    fd_attr->nla_type = 1/*IFLA_XDP_FD*/;
    fd_attr->nla_len = NLA_HDRLEN + sizeof(progfd);
    memcpy((char *)fd_attr + NLA_HDRLEN, &progfd, sizeof(progfd));

    nest->nla_len = NLA_HDRLEN + fd_attr->nla_len;
    req.nh.nlmsg_len += NLA_ALIGN(nest->nla_len);

    if (send(nl_fd, &req, req.nh.nlmsg_len, 0) < 0) {
        fprintf(stderr, "bpf: send to netlink: %s\n", strerror(errno));
        goto cleanup;
    }

    len = recv(nl_fd, reply, sizeof(reply), 0);
    if (len < 0) {
        fprintf(stderr, "bpf: recv from netlink: %s\n", strerror(errno));
        goto cleanup;
    }

    msg = (struct nlmsghdr *)reply;
    while (NLMSG_OK(msg, len)) {
        if (msg->nlmsg_pid != getpid()) {
            fprintf(stderr, "bpf: Wrong pid %d, expected %d\n",
                   msg->nlmsg_pid, getpid());
            errno = EBADMSG;
            goto cleanup;
        }
        if (msg->nlmsg_seq != seq) {
            fprintf(stderr, "bpf: Wrong seq %d, expected %d\n",
                   msg->nlmsg_seq, seq);
            errno = EBADMSG;
            goto cleanup;
        }
        // only error messages with a non-zero code abort; everything else
        // (including NLMSG_DONE) just moves on to the next message
        if (msg->nlmsg_type == NLMSG_ERROR) {
            nl_err = (struct nlmsgerr *)NLMSG_DATA(msg);
            if (nl_err->error) {
                fprintf(stderr, "bpf: nlmsg error %s\n", strerror(-nl_err->error));
                errno = -nl_err->error;
                goto cleanup;
            }
        }
        msg = NLMSG_NEXT(msg, len);
    }

    ret = 0;

cleanup:
    close(nl_fd);
    return ret;
}
690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746

/*
 * Open a hardware or software perf event and attach BPF program progfd to it.
 *
 * ev_type must be PERF_TYPE_HARDWARE or PERF_TYPE_SOFTWARE, ev_config must be
 * below the matching PERF_COUNT_*_MAX, and exactly one of sample_period and
 * sample_freq must be non-zero. The arguments are validated before any
 * syscall is made. The event is opened with inherit set, the program is
 * attached, and the event is enabled.
 *
 * Returns the perf event fd on success, -1 on failure.
 */
int bpf_attach_perf_event(int progfd, uint32_t ev_type, uint32_t ev_config,
                          uint64_t sample_period, uint64_t sample_freq,
                          pid_t pid, int cpu, int group_fd) {
  const int is_hw = (ev_type == PERF_TYPE_HARDWARE);
  const int is_sw = (ev_type == PERF_TYPE_SOFTWARE);
  const int has_period = (sample_period > 0);
  const int has_freq = (sample_freq > 0);
  struct perf_event_attr attr;
  int fd;

  if (!is_hw && !is_sw) {
    fprintf(stderr, "Unsupported perf event type\n");
    return -1;
  }
  if ((is_hw && ev_config >= PERF_COUNT_HW_MAX) ||
      (is_sw && ev_config >= PERF_COUNT_SW_MAX)) {
    fprintf(stderr, "Invalid perf event config\n");
    return -1;
  }
  // both set or both unset is rejected
  if (has_period == has_freq) {
    fprintf(
      stderr, "Exactly one of sample_period / sample_freq should be set\n"
    );
    return -1;
  }

  memset(&attr, 0, sizeof(attr));
  attr.type = ev_type;
  attr.config = ev_config;
  attr.inherit = 1;
  if (has_freq) {
    attr.freq = 1;
    attr.sample_freq = sample_freq;
  } else {
    attr.sample_period = sample_period;
  }

  fd = syscall(
    __NR_perf_event_open, &attr, pid, cpu, group_fd, PERF_FLAG_FD_CLOEXEC
  );
  if (fd < 0) {
    perror("perf_event_open failed");
    return -1;
  }

  if (ioctl(fd, PERF_EVENT_IOC_SET_BPF, progfd) != 0) {
    perror("ioctl(PERF_EVENT_IOC_SET_BPF) failed");
    goto fail_close;
  }
  if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) != 0) {
    perror("ioctl(PERF_EVENT_IOC_ENABLE) failed");
    goto fail_close;
  }

  return fd;

fail_close:
  close(fd);
  return -1;
}

/*
 * Counterpart of bpf_attach_perf_event(). Nothing needs to be undone at the
 * moment; the function exists so callers detach whatever they attach.
 * Always returns 0.
 */
int bpf_detach_perf_event(uint32_t ev_type, uint32_t ev_config) {
  (void)ev_type;
  (void)ev_config;

  return 0;
}
747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765

/*
 * Issue bpf(BPF_OBJ_PIN) to pin the BPF object behind fd at pathname.
 * Returns the raw syscall result.
 */
int bpf_obj_pin(int fd, const char *pathname)
{
  union bpf_attr attr;

  // Zero the whole union explicitly, as the other wrappers in this file do:
  // a designated initializer only guarantees the named member's fields, not
  // the remaining bytes of the union passed to the kernel.
  memset(&attr, 0, sizeof(attr));
  attr.pathname = ptr_to_u64((void *)pathname);
  attr.bpf_fd = fd;

  return syscall(__NR_bpf, BPF_OBJ_PIN, &attr, sizeof(attr));
}

/*
 * Issue bpf(BPF_OBJ_GET) for the object pinned at pathname.
 * Returns the raw syscall result.
 */
int bpf_obj_get(const char *pathname)
{
  union bpf_attr attr;

  // Zero the whole union explicitly, as the other wrappers in this file do:
  // a designated initializer only guarantees the named member's fields, not
  // the remaining bytes of the union passed to the kernel.
  memset(&attr, 0, sizeof(attr));
  attr.pathname = ptr_to_u64((void *)pathname);

  return syscall(__NR_bpf, BPF_OBJ_GET, &attr, sizeof(attr));
}