
TOMOYO Linux Cross Reference
Linux/tools/perf/builtin-record.c


  1 // SPDX-License-Identifier: GPL-2.0
  2 /*
  3  * builtin-record.c
  4  *
  5  * Builtin record command: Record the profile of a workload
  6  * (or a CPU, or a PID) into the perf.data output file - for
  7  * later analysis via perf report.
  8  */
  9 #include "builtin.h"
 10 
 11 #include "util/build-id.h"
 12 #include <subcmd/parse-options.h>
 13 #include <internal/xyarray.h>
 14 #include "util/parse-events.h"
 15 #include "util/config.h"
 16 
 17 #include "util/callchain.h"
 18 #include "util/cgroup.h"
 19 #include "util/header.h"
 20 #include "util/event.h"
 21 #include "util/evlist.h"
 22 #include "util/evsel.h"
 23 #include "util/debug.h"
 24 #include "util/mmap.h"
 25 #include "util/mutex.h"
 26 #include "util/target.h"
 27 #include "util/session.h"
 28 #include "util/tool.h"
 29 #include "util/symbol.h"
 30 #include "util/record.h"
 31 #include "util/cpumap.h"
 32 #include "util/thread_map.h"
 33 #include "util/data.h"
 34 #include "util/perf_regs.h"
 35 #include "util/auxtrace.h"
 36 #include "util/tsc.h"
 37 #include "util/parse-branch-options.h"
 38 #include "util/parse-regs-options.h"
 39 #include "util/perf_api_probe.h"
 40 #include "util/trigger.h"
 41 #include "util/perf-hooks.h"
 42 #include "util/cpu-set-sched.h"
 43 #include "util/synthetic-events.h"
 44 #include "util/time-utils.h"
 45 #include "util/units.h"
 46 #include "util/bpf-event.h"
 47 #include "util/util.h"
 48 #include "util/pfm.h"
 49 #include "util/pmu.h"
 50 #include "util/pmus.h"
 51 #include "util/clockid.h"
 52 #include "util/off_cpu.h"
 53 #include "util/bpf-filter.h"
 54 #include "asm/bug.h"
 55 #include "perf.h"
 56 #include "cputopo.h"
 57 
 58 #include <errno.h>
 59 #include <inttypes.h>
 60 #include <locale.h>
 61 #include <poll.h>
 62 #include <pthread.h>
 63 #include <unistd.h>
 64 #ifndef HAVE_GETTID
 65 #include <syscall.h>
 66 #endif
 67 #include <sched.h>
 68 #include <signal.h>
 69 #ifdef HAVE_EVENTFD_SUPPORT
 70 #include <sys/eventfd.h>
 71 #endif
 72 #include <sys/mman.h>
 73 #include <sys/wait.h>
 74 #include <sys/types.h>
 75 #include <sys/stat.h>
 76 #include <fcntl.h>
 77 #include <linux/err.h>
 78 #include <linux/string.h>
 79 #include <linux/time64.h>
 80 #include <linux/zalloc.h>
 81 #include <linux/bitmap.h>
 82 #include <sys/time.h>
 83 
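/*
 * State for rotating the output file while recording, typically populated
 * from options such as --switch-output[=signal|<size>|<time>] and
 * --switch-max-files: rotation can be requested by signal, after a given
 * amount of written data ("size"), or after a given time ("time").
 * "filenames"/"num_files"/"cur_file" track the most recent output files so
 * older ones can be pruned when only a limited number is kept.
 */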
 84 struct switch_output {
 85         bool             enabled;
 86         bool             signal;
 87         unsigned long    size;
 88         unsigned long    time;
 89         const char      *str;
 90         bool             set;
 91         char             **filenames;
 92         int              num_files;
 93         int              cur_file;
 94 };
 95 
 96 struct thread_mask {
 97         struct mmap_cpu_mask    maps;
 98         struct mmap_cpu_mask    affinity;
 99 };
100 
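/*
 * Per-thread state for parallel trace streaming (--threads).  Each recording
 * thread services a subset of the evlist mmaps ("maps" and "overwrite_maps"),
 * polls its own "pollfd" set, and communicates with the main thread over the
 * msg/ack pipes.  The samples/waking/bytes counters feed the per-thread
 * statistics reported for the session.
 */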
101 struct record_thread {
102         pid_t                   tid;
103         struct thread_mask      *mask;
104         struct {
105                 int             msg[2];
106                 int             ack[2];
107         } pipes;
108         struct fdarray          pollfd;
109         int                     ctlfd_pos;
110         int                     nr_mmaps;
111         struct mmap             **maps;
112         struct mmap             **overwrite_maps;
113         struct record           *rec;
114         unsigned long long      samples;
115         unsigned long           waking;
116         u64                     bytes_written;
117         u64                     bytes_transferred;
118         u64                     bytes_compressed;
119 };
120 
121 static __thread struct record_thread *thread;
122 
123 enum thread_msg {
124         THREAD_MSG__UNDEFINED = 0,
125         THREAD_MSG__READY,
126         THREAD_MSG__MAX,
127 };
128 
129 static const char *thread_msg_tags[THREAD_MSG__MAX] = {
130         "UNDEFINED", "READY"
131 };
132 
133 enum thread_spec {
134         THREAD_SPEC__UNDEFINED = 0,
135         THREAD_SPEC__CPU,
136         THREAD_SPEC__CORE,
137         THREAD_SPEC__PACKAGE,
138         THREAD_SPEC__NUMA,
139         THREAD_SPEC__USER,
140         THREAD_SPEC__MAX,
141 };
142 
143 static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
144         "undefined", "cpu", "core", "package", "numa", "user"
145 };
146 
147 struct pollfd_index_map {
148         int evlist_pollfd_index;
149         int thread_pollfd_index;
150 };
151 
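/*
 * Top-level state of a perf record session: the tool callbacks, the parsed
 * record options, the output perf_data file (or directory in threaded mode),
 * the AUX area tracing state (itr), the event list and session, plus the
 * thread masks and per-thread data used when parallel streaming is enabled.
 */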
152 struct record {
153         struct perf_tool        tool;
154         struct record_opts      opts;
155         u64                     bytes_written;
156         u64                     thread_bytes_written;
157         struct perf_data        data;
158         struct auxtrace_record  *itr;
159         struct evlist   *evlist;
160         struct perf_session     *session;
161         struct evlist           *sb_evlist;
162         pthread_t               thread_id;
163         int                     realtime_prio;
164         bool                    switch_output_event_set;
165         bool                    no_buildid;
166         bool                    no_buildid_set;
167         bool                    no_buildid_cache;
168         bool                    no_buildid_cache_set;
169         bool                    buildid_all;
170         bool                    buildid_mmap;
171         bool                    timestamp_filename;
172         bool                    timestamp_boundary;
173         bool                    off_cpu;
174         struct switch_output    switch_output;
175         unsigned long long      samples;
176         unsigned long           output_max_size;        /* = 0: unlimited */
177         struct perf_debuginfod  debuginfod;
178         int                     nr_threads;
179         struct thread_mask      *thread_masks;
180         struct record_thread    *thread_data;
181         struct pollfd_index_map *index_map;
182         size_t                  index_map_sz;
183         size_t                  index_map_cnt;
184 };
185 
186 static volatile int done;
187 
188 static volatile int auxtrace_record__snapshot_started;
189 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
190 static DEFINE_TRIGGER(switch_output_trigger);
191 
192 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
193         "SYS", "NODE", "CPU"
194 };
195 
196 #ifndef HAVE_GETTID
197 static inline pid_t gettid(void)
198 {
199         return (pid_t)syscall(__NR_gettid);
200 }
201 #endif
202 
203 static int record__threads_enabled(struct record *rec)
204 {
205         return rec->opts.threads_spec;
206 }
207 
208 static bool switch_output_signal(struct record *rec)
209 {
210         return rec->switch_output.signal &&
211                trigger_is_ready(&switch_output_trigger);
212 }
213 
214 static bool switch_output_size(struct record *rec)
215 {
216         return rec->switch_output.size &&
217                trigger_is_ready(&switch_output_trigger) &&
218                (rec->bytes_written >= rec->switch_output.size);
219 }
220 
221 static bool switch_output_time(struct record *rec)
222 {
223         return rec->switch_output.time &&
224                trigger_is_ready(&switch_output_trigger);
225 }
226 
227 static u64 record__bytes_written(struct record *rec)
228 {
229         return rec->bytes_written + rec->thread_bytes_written;
230 }
231 
232 static bool record__output_max_size_exceeded(struct record *rec)
233 {
234         return rec->output_max_size &&
235                (record__bytes_written(rec) >= rec->output_max_size);
236 }
237 
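/*
 * Write a block of data to the output.  In threaded mode an mmap can carry
 * its own output file, so data from such maps is accounted to the current
 * record thread; everything else goes to the main perf.data file.  Writing
 * stops the session once the configured maximum output size is exceeded and
 * fires the switch-output trigger when the size threshold is reached.
 */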
238 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
239                          void *bf, size_t size)
240 {
241         struct perf_data_file *file = &rec->session->data->file;
242 
243         if (map && map->file)
244                 file = map->file;
245 
246         if (perf_data_file__write(file, bf, size) < 0) {
247                 pr_err("failed to write perf data, error: %m\n");
248                 return -1;
249         }
250 
251         if (map && map->file) {
252                 thread->bytes_written += size;
253                 rec->thread_bytes_written += size;
254         } else {
255                 rec->bytes_written += size;
256         }
257 
258         if (record__output_max_size_exceeded(rec) && !done) {
259                 fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
260                                 " stopping session ]\n",
261                                 record__bytes_written(rec) >> 10);
262                 done = 1;
263         }
264 
265         if (switch_output_size(rec))
266                 trigger_hit(&switch_output_trigger);
267 
268         return 0;
269 }
270 
271 static int record__aio_enabled(struct record *rec);
272 static int record__comp_enabled(struct record *rec);
273 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
274                             void *dst, size_t dst_size, void *src, size_t src_size);
275 
276 #ifdef HAVE_AIO_SUPPORT
277 static int record__aio_write(struct aiocb *cblock, int trace_fd,
278                 void *buf, size_t size, off_t off)
279 {
280         int rc;
281 
282         cblock->aio_fildes = trace_fd;
283         cblock->aio_buf    = buf;
284         cblock->aio_nbytes = size;
285         cblock->aio_offset = off;
286         cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
287 
288         do {
289                 rc = aio_write(cblock);
290                 if (rc == 0) {
291                         break;
292                 } else if (errno != EAGAIN) {
293                         cblock->aio_fildes = -1;
294                         pr_err("failed to queue perf data, error: %m\n");
295                         break;
296                 }
297         } while (1);
298 
299         return rc;
300 }
301 
302 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
303 {
304         void *rem_buf;
305         off_t rem_off;
306         size_t rem_size;
307         int rc, aio_errno;
308         ssize_t aio_ret, written;
309 
310         aio_errno = aio_error(cblock);
311         if (aio_errno == EINPROGRESS)
312                 return 0;
313 
314         written = aio_ret = aio_return(cblock);
315         if (aio_ret < 0) {
316                 if (aio_errno != EINTR)
317                         pr_err("failed to write perf data, error: %m\n");
318                 written = 0;
319         }
320 
321         rem_size = cblock->aio_nbytes - written;
322 
323         if (rem_size == 0) {
324                 cblock->aio_fildes = -1;
325                 /*
326                  * md->refcount is incremented in record__aio_pushfn() for
327                  * every aio write request started in record__aio_push() so
328                  * decrement it because the request is now complete.
329                  */
330                 perf_mmap__put(&md->core);
331                 rc = 1;
332         } else {
333                 /*
334                  * The aio write request may need to be restarted with the
335                  * remainder if the kernel didn't write the whole chunk
336                  * at once.
337                  */
338                 rem_off = cblock->aio_offset + written;
339                 rem_buf = (void *)(cblock->aio_buf + written);
340                 record__aio_write(cblock, cblock->aio_fildes,
341                                 rem_buf, rem_size, rem_off);
342                 rc = 0;
343         }
344 
345         return rc;
346 }
347 
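/*
 * Wait for in-flight aio writes on this mmap.  With sync_all == false the
 * index of the first free or completed control block is returned so a new
 * write can be queued; with sync_all == true the function only returns
 * (as -1) once every outstanding request has completed.
 */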
348 static int record__aio_sync(struct mmap *md, bool sync_all)
349 {
350         struct aiocb **aiocb = md->aio.aiocb;
351         struct aiocb *cblocks = md->aio.cblocks;
352         struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
353         int i, do_suspend;
354 
355         do {
356                 do_suspend = 0;
357                 for (i = 0; i < md->aio.nr_cblocks; ++i) {
358                         if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
359                                 if (sync_all)
360                                         aiocb[i] = NULL;
361                                 else
362                                         return i;
363                         } else {
364                                 /*
365                                  * The started aio write is not complete yet,
366                                  * so it has to be waited for before the
367                                  * next allocation.
368                                  */
369                                 aiocb[i] = &cblocks[i];
370                                 do_suspend = 1;
371                         }
372                 }
373                 if (!do_suspend)
374                         return -1;
375 
376                 while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
377                         if (!(errno == EAGAIN || errno == EINTR))
378                                 pr_err("failed to sync perf data, error: %m\n");
379                 }
380         } while (1);
381 }
382 
383 struct record_aio {
384         struct record   *rec;
385         void            *data;
386         size_t          size;
387 };
388 
389 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
390 {
391         struct record_aio *aio = to;
392 
393         /*
394          * map->core.base data pointed to by buf is copied into a free map->aio.data[]
395          * buffer to release space in the kernel buffer as fast as possible, calling
396          * perf_mmap__consume() from the perf_mmap__push() function.
397          *
398          * That lets the kernel proceed with storing more profiling data into
399          * the kernel buffer earlier than other per-cpu kernel buffers are handled.
400          *
401          * Copying can be done in two steps in case the chunk of profiling data
402          * crosses the upper bound of the kernel buffer. In this case we first move
403          * part of the data from map->start to the upper bound and then the remainder
404          * from the beginning of the kernel buffer to the end of the data chunk.
405          */
406 
407         if (record__comp_enabled(aio->rec)) {
408                 ssize_t compressed = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
409                                                    mmap__mmap_len(map) - aio->size,
410                                                    buf, size);
411                 if (compressed < 0)
412                         return (int)compressed;
413 
414                 size = compressed;
415         } else {
416                 memcpy(aio->data + aio->size, buf, size);
417         }
418 
419         if (!aio->size) {
420                 /*
421                  * Increment map->refcount to guard the map->aio.data[] buffer
422                  * from premature deallocation, because the map object can be
423                  * released earlier than the aio write request started on the
424                  * map->aio.data[] buffer is complete.
425                  *
426                  * perf_mmap__put() is done in record__aio_complete() after
427                  * the started aio request completes, or in record__aio_push()
428                  * if the request failed to start.
429                  */
430                 perf_mmap__get(&map->core);
431         }
432 
433         aio->size += size;
434 
435         return size;
436 }
437 
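/*
 * Push the data currently in the mmap ring buffer into a free aio buffer
 * (compressing it on the way when compression is enabled) and queue an
 * asynchronous write of it at file offset *off, which is advanced on
 * success.  On failure the reference taken in record__aio_pushfn() is
 * dropped here.
 */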
438 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
439 {
440         int ret, idx;
441         int trace_fd = rec->session->data->file.fd;
442         struct record_aio aio = { .rec = rec, .size = 0 };
443 
444         /*
445          * Call record__aio_sync() to wait until a map->aio.data[] buffer
446          * becomes available after the previous aio write operation.
447          */
448 
449         idx = record__aio_sync(map, false);
450         aio.data = map->aio.data[idx];
451         ret = perf_mmap__push(map, &aio, record__aio_pushfn);
452         if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
453                 return ret;
454 
455         rec->samples++;
456         ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
457         if (!ret) {
458                 *off += aio.size;
459                 rec->bytes_written += aio.size;
460                 if (switch_output_size(rec))
461                         trigger_hit(&switch_output_trigger);
462         } else {
463                 /*
464                  * Drop the map->refcount reference taken in record__aio_pushfn()
465                  * if the record__aio_write() operation failed to start; otherwise
466                  * map->refcount is decremented in record__aio_complete() after
467                  * the aio write operation finishes successfully.
468                  */
469                 perf_mmap__put(&map->core);
470         }
471 
472         return ret;
473 }
474 
475 static off_t record__aio_get_pos(int trace_fd)
476 {
477         return lseek(trace_fd, 0, SEEK_CUR);
478 }
479 
480 static void record__aio_set_pos(int trace_fd, off_t pos)
481 {
482         lseek(trace_fd, pos, SEEK_SET);
483 }
484 
485 static void record__aio_mmap_read_sync(struct record *rec)
486 {
487         int i;
488         struct evlist *evlist = rec->evlist;
489         struct mmap *maps = evlist->mmap;
490 
491         if (!record__aio_enabled(rec))
492                 return;
493 
494         for (i = 0; i < evlist->core.nr_mmaps; i++) {
495                 struct mmap *map = &maps[i];
496 
497                 if (map->core.base)
498                         record__aio_sync(map, true);
499         }
500 }
501 
502 static int nr_cblocks_default = 1;
503 static int nr_cblocks_max = 4;
504 
505 static int record__aio_parse(const struct option *opt,
506                              const char *str,
507                              int unset)
508 {
509         struct record_opts *opts = (struct record_opts *)opt->value;
510 
511         if (unset) {
512                 opts->nr_cblocks = 0;
513         } else {
514                 if (str)
515                         opts->nr_cblocks = strtol(str, NULL, 0);
516                 if (!opts->nr_cblocks)
517                         opts->nr_cblocks = nr_cblocks_default;
518         }
519 
520         return 0;
521 }
522 #else /* HAVE_AIO_SUPPORT */
523 static int nr_cblocks_max = 0;
524 
525 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
526                             off_t *off __maybe_unused)
527 {
528         return -1;
529 }
530 
531 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
532 {
533         return -1;
534 }
535 
536 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
537 {
538 }
539 
540 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
541 {
542 }
543 #endif
544 
545 static int record__aio_enabled(struct record *rec)
546 {
547         return rec->opts.nr_cblocks > 0;
548 }
549 
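/*
 * Parse the minimal number of bytes extracted from the mmap ring buffer per
 * flush (the --mmap-flush option).  Plain numbers and B/K/M/G suffixed
 * values are accepted; the default is 1 byte and the value is capped at a
 * quarter of the mmap buffer size.
 */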
550 #define MMAP_FLUSH_DEFAULT 1
551 static int record__mmap_flush_parse(const struct option *opt,
552                                     const char *str,
553                                     int unset)
554 {
555         int flush_max;
556         struct record_opts *opts = (struct record_opts *)opt->value;
557         static struct parse_tag tags[] = {
558                         { .tag  = 'B', .mult = 1       },
559                         { .tag  = 'K', .mult = 1 << 10 },
560                         { .tag  = 'M', .mult = 1 << 20 },
561                         { .tag  = 'G', .mult = 1 << 30 },
562                         { .tag  = 0 },
563         };
564 
565         if (unset)
566                 return 0;
567 
568         if (str) {
569                 opts->mmap_flush = parse_tag_value(str, tags);
570                 if (opts->mmap_flush == (int)-1)
571                         opts->mmap_flush = strtol(str, NULL, 0);
572         }
573 
574         if (!opts->mmap_flush)
575                 opts->mmap_flush = MMAP_FLUSH_DEFAULT;
576 
577         flush_max = evlist__mmap_size(opts->mmap_pages);
578         flush_max /= 4;
579         if (opts->mmap_flush > flush_max)
580                 opts->mmap_flush = flush_max;
581 
582         return 0;
583 }
584 
585 #ifdef HAVE_ZSTD_SUPPORT
586 static unsigned int comp_level_default = 1;
587 
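/*
 * Parse the trace compression level (the -z/--compression-level option,
 * available when perf is built with zstd): unsetting it disables
 * compression, a bare flag selects comp_level_default (1), and an explicit
 * value is used as given (zstd supports levels up to comp_level_max, 22).
 */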
588 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
589 {
590         struct record_opts *opts = opt->value;
591 
592         if (unset) {
593                 opts->comp_level = 0;
594         } else {
595                 if (str)
596                         opts->comp_level = strtol(str, NULL, 0);
597                 if (!opts->comp_level)
598                         opts->comp_level = comp_level_default;
599         }
600 
601         return 0;
602 }
603 #endif
604 static unsigned int comp_level_max = 22;
605 
606 static int record__comp_enabled(struct record *rec)
607 {
608         return rec->opts.comp_level > 0;
609 }
610 
611 static int process_synthesized_event(struct perf_tool *tool,
612                                      union perf_event *event,
613                                      struct perf_sample *sample __maybe_unused,
614                                      struct machine *machine __maybe_unused)
615 {
616         struct record *rec = container_of(tool, struct record, tool);
617         return record__write(rec, NULL, event, event->header.size);
618 }
619 
620 static struct mutex synth_lock;
621 
622 static int process_locked_synthesized_event(struct perf_tool *tool,
623                                      union perf_event *event,
624                                      struct perf_sample *sample __maybe_unused,
625                                      struct machine *machine __maybe_unused)
626 {
627         int ret;
628 
629         mutex_lock(&synth_lock);
630         ret = process_synthesized_event(tool, event, sample, machine);
631         mutex_unlock(&synth_lock);
632         return ret;
633 }
634 
635 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
636 {
637         struct record *rec = to;
638 
639         if (record__comp_enabled(rec)) {
640                 ssize_t compressed = zstd_compress(rec->session, map, map->data,
641                                                    mmap__mmap_len(map), bf, size);
642 
643                 if (compressed < 0)
644                         return (int)compressed;
645 
646                 size = compressed;
647                 bf   = map->data;
648         }
649 
650         thread->samples++;
651         return record__write(rec, map, bf, size);
652 }
653 
654 static volatile sig_atomic_t signr = -1;
655 static volatile sig_atomic_t child_finished;
656 #ifdef HAVE_EVENTFD_SUPPORT
657 static volatile sig_atomic_t done_fd = -1;
658 #endif
659 
660 static void sig_handler(int sig)
661 {
662         if (sig == SIGCHLD)
663                 child_finished = 1;
664         else
665                 signr = sig;
666 
667         done = 1;
668 #ifdef HAVE_EVENTFD_SUPPORT
669         if (done_fd >= 0) {
670                 u64 tmp = 1;
671                 int orig_errno = errno;
672 
673                 /*
674                  * It is possible for this signal handler to run after done is
675                  * checked in the main loop, but before the perf counter fds are
676                  * polled. If this happens, the poll() will continue to wait
677                  * even though done is set, and will only break out if either
678                  * another signal is received, or the counters are ready for
679                  * read. To ensure the poll() doesn't sleep when done is set,
680                  * use an eventfd (done_fd) to wake up the poll().
681                  */
682                 if (write(done_fd, &tmp, sizeof(tmp)) < 0)
683                         pr_err("failed to signal wakeup fd, error: %m\n");
684 
685                 errno = orig_errno;
686         }
687 #endif // HAVE_EVENTFD_SUPPORT
688 }
689 
690 static void sigsegv_handler(int sig)
691 {
692         perf_hooks__recover();
693         sighandler_dump_stack(sig);
694 }
695 
696 static void record__sig_exit(void)
697 {
698         if (signr == -1)
699                 return;
700 
701         signal(signr, SIG_DFL);
702         raise(signr);
703 }
704 
705 #ifdef HAVE_AUXTRACE_SUPPORT
706 
707 static int record__process_auxtrace(struct perf_tool *tool,
708                                     struct mmap *map,
709                                     union perf_event *event, void *data1,
710                                     size_t len1, void *data2, size_t len2)
711 {
712         struct record *rec = container_of(tool, struct record, tool);
713         struct perf_data *data = &rec->data;
714         size_t padding;
715         u8 pad[8] = {0};
716 
717         if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
718                 off_t file_offset;
719                 int fd = perf_data__fd(data);
720                 int err;
721 
722                 file_offset = lseek(fd, 0, SEEK_CUR);
723                 if (file_offset == -1)
724                         return -1;
725                 err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
726                                                      event, file_offset);
727                 if (err)
728                         return err;
729         }
730 
731         /* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
732         padding = (len1 + len2) & 7;
733         if (padding)
734                 padding = 8 - padding;
735 
736         record__write(rec, map, event, event->header.size);
737         record__write(rec, map, data1, len1);
738         if (len2)
739                 record__write(rec, map, data2, len2);
740         record__write(rec, map, &pad, padding);
741 
742         return 0;
743 }
744 
745 static int record__auxtrace_mmap_read(struct record *rec,
746                                       struct mmap *map)
747 {
748         int ret;
749 
750         ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
751                                   record__process_auxtrace);
752         if (ret < 0)
753                 return ret;
754 
755         if (ret)
756                 rec->samples++;
757 
758         return 0;
759 }
760 
761 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
762                                                struct mmap *map)
763 {
764         int ret;
765 
766         ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
767                                            record__process_auxtrace,
768                                            rec->opts.auxtrace_snapshot_size);
769         if (ret < 0)
770                 return ret;
771 
772         if (ret)
773                 rec->samples++;
774 
775         return 0;
776 }
777 
778 static int record__auxtrace_read_snapshot_all(struct record *rec)
779 {
780         int i;
781         int rc = 0;
782 
783         for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
784                 struct mmap *map = &rec->evlist->mmap[i];
785 
786                 if (!map->auxtrace_mmap.base)
787                         continue;
788 
789                 if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
790                         rc = -1;
791                         goto out;
792                 }
793         }
794 out:
795         return rc;
796 }
797 
798 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
799 {
800         pr_debug("Recording AUX area tracing snapshot\n");
801         if (record__auxtrace_read_snapshot_all(rec) < 0) {
802                 trigger_error(&auxtrace_snapshot_trigger);
803         } else {
804                 if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
805                         trigger_error(&auxtrace_snapshot_trigger);
806                 else
807                         trigger_ready(&auxtrace_snapshot_trigger);
808         }
809 }
810 
811 static int record__auxtrace_snapshot_exit(struct record *rec)
812 {
813         if (trigger_is_error(&auxtrace_snapshot_trigger))
814                 return 0;
815 
816         if (!auxtrace_record__snapshot_started &&
817             auxtrace_record__snapshot_start(rec->itr))
818                 return -1;
819 
820         record__read_auxtrace_snapshot(rec, true);
821         if (trigger_is_error(&auxtrace_snapshot_trigger))
822                 return -1;
823 
824         return 0;
825 }
826 
827 static int record__auxtrace_init(struct record *rec)
828 {
829         int err;
830 
831         if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
832             && record__threads_enabled(rec)) {
833                 pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
834                 return -EINVAL;
835         }
836 
837         if (!rec->itr) {
838                 rec->itr = auxtrace_record__init(rec->evlist, &err);
839                 if (err)
840                         return err;
841         }
842 
843         err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
844                                               rec->opts.auxtrace_snapshot_opts);
845         if (err)
846                 return err;
847 
848         err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
849                                             rec->opts.auxtrace_sample_opts);
850         if (err)
851                 return err;
852 
853         auxtrace_regroup_aux_output(rec->evlist);
854 
855         return auxtrace_parse_filters(rec->evlist);
856 }
857 
858 #else
859 
860 static inline
861 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
862                                struct mmap *map __maybe_unused)
863 {
864         return 0;
865 }
866 
867 static inline
868 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
869                                     bool on_exit __maybe_unused)
870 {
871 }
872 
873 static inline
874 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
875 {
876         return 0;
877 }
878 
879 static inline
880 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
881 {
882         return 0;
883 }
884 
885 static int record__auxtrace_init(struct record *rec __maybe_unused)
886 {
887         return 0;
888 }
889 
890 #endif
891 
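/*
 * Make sure kernel text modifications are captured: if no event in the
 * evlist has attr.text_poke set, add a system-wide dummy event with
 * text_poke and ksymbol enabled so PERF_RECORD_TEXT_POKE and
 * PERF_RECORD_KSYMBOL events are recorded with timestamps.
 */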
892 static int record__config_text_poke(struct evlist *evlist)
893 {
894         struct evsel *evsel;
895 
896         /* Nothing to do if text poke is already configured */
897         evlist__for_each_entry(evlist, evsel) {
898                 if (evsel->core.attr.text_poke)
899                         return 0;
900         }
901 
902         evsel = evlist__add_dummy_on_all_cpus(evlist);
903         if (!evsel)
904                 return -ENOMEM;
905 
906         evsel->core.attr.text_poke = 1;
907         evsel->core.attr.ksymbol = 1;
908         evsel->immediate = true;
909         evsel__set_sample_bit(evsel, TIME);
910 
911         return 0;
912 }
913 
914 static int record__config_off_cpu(struct record *rec)
915 {
916         return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
917 }
918 
919 static bool record__tracking_system_wide(struct record *rec)
920 {
921         struct evlist *evlist = rec->evlist;
922         struct evsel *evsel;
923 
924         /*
925          * If a non-dummy evsel exists, system-wide sideband is needed to
926          * help parse sample information.
927          * For example, the PERF_RECORD_MMAP event helps resolve symbols,
928          * and the PERF_RECORD_COMM event helps parse the task executable name.
929          */
930         evlist__for_each_entry(evlist, evsel) {
931                 if (!evsel__is_dummy_event(evsel))
932                         return true;
933         }
934 
935         return false;
936 }
937 
938 static int record__config_tracking_events(struct record *rec)
939 {
940         struct record_opts *opts = &rec->opts;
941         struct evlist *evlist = rec->evlist;
942         bool system_wide = false;
943         struct evsel *evsel;
944 
945         /*
946          * For initial_delay, system wide or a hybrid system, we need to add a
947          * tracking event so that we can track PERF_RECORD_MMAP to cover the
948          * delay of waiting or event synthesis.
949          */
950         if (opts->target.initial_delay || target__has_cpu(&opts->target) ||
951             perf_pmus__num_core_pmus() > 1) {
952 
953                 /*
954                  * User space tasks can migrate between CPUs, so when tracing
955                  * selected CPUs, sideband for all CPUs is still needed.
956                  */
957                 if (!!opts->target.cpu_list && record__tracking_system_wide(rec))
958                         system_wide = true;
959 
960                 evsel = evlist__findnew_tracking_event(evlist, system_wide);
961                 if (!evsel)
962                         return -ENOMEM;
963 
964                 /*
965                  * Enable the tracking event when the process is forked for
966                  * initial_delay, immediately for system wide.
967                  */
968                 if (opts->target.initial_delay && !evsel->immediate &&
969                     !target__has_cpu(&opts->target))
970                         evsel->core.attr.enable_on_exec = 1;
971                 else
972                         evsel->immediate = 1;
973         }
974 
975         return 0;
976 }
977 
978 static bool record__kcore_readable(struct machine *machine)
979 {
980         char kcore[PATH_MAX];
981         int fd;
982 
983         scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
984 
985         fd = open(kcore, O_RDONLY);
986         if (fd < 0)
987                 return false;
988 
989         close(fd);
990 
991         return true;
992 }
993 
994 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
995 {
996         char from_dir[PATH_MAX];
997         char kcore_dir[PATH_MAX];
998         int ret;
999 
1000         snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
1001 
1002         ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
1003         if (ret)
1004                 return ret;
1005 
1006         return kcore_copy(from_dir, kcore_dir);
1007 }
1008 
1009 static void record__thread_data_init_pipes(struct record_thread *thread_data)
1010 {
1011         thread_data->pipes.msg[0] = -1;
1012         thread_data->pipes.msg[1] = -1;
1013         thread_data->pipes.ack[0] = -1;
1014         thread_data->pipes.ack[1] = -1;
1015 }
1016 
1017 static int record__thread_data_open_pipes(struct record_thread *thread_data)
1018 {
1019         if (pipe(thread_data->pipes.msg))
1020                 return -EINVAL;
1021 
1022         if (pipe(thread_data->pipes.ack)) {
1023                 close(thread_data->pipes.msg[0]);
1024                 thread_data->pipes.msg[0] = -1;
1025                 close(thread_data->pipes.msg[1]);
1026                 thread_data->pipes.msg[1] = -1;
1027                 return -EINVAL;
1028         }
1029 
1030         pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
1031                  thread_data->pipes.msg[0], thread_data->pipes.msg[1],
1032                  thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
1033 
1034         return 0;
1035 }
1036 
1037 static void record__thread_data_close_pipes(struct record_thread *thread_data)
1038 {
1039         if (thread_data->pipes.msg[0] != -1) {
1040                 close(thread_data->pipes.msg[0]);
1041                 thread_data->pipes.msg[0] = -1;
1042         }
1043         if (thread_data->pipes.msg[1] != -1) {
1044                 close(thread_data->pipes.msg[1]);
1045                 thread_data->pipes.msg[1] = -1;
1046         }
1047         if (thread_data->pipes.ack[0] != -1) {
1048                 close(thread_data->pipes.ack[0]);
1049                 thread_data->pipes.ack[0] = -1;
1050         }
1051         if (thread_data->pipes.ack[1] != -1) {
1052                 close(thread_data->pipes.ack[1]);
1053                 thread_data->pipes.ack[1] = -1;
1054         }
1055 }
1056 
1057 static bool evlist__per_thread(struct evlist *evlist)
1058 {
1059         return cpu_map__is_dummy(evlist->core.user_requested_cpus);
1060 }
1061 
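/*
 * Distribute the evlist mmaps among the recording threads: in per-thread
 * mode a thread gets all of them, otherwise it gets the mmaps whose CPU is
 * set in the thread's "maps" mask.  Regular and overwrite mmaps are tracked
 * in separate arrays.
 */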
1062 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
1063 {
1064         int m, tm, nr_mmaps = evlist->core.nr_mmaps;
1065         struct mmap *mmap = evlist->mmap;
1066         struct mmap *overwrite_mmap = evlist->overwrite_mmap;
1067         struct perf_cpu_map *cpus = evlist->core.all_cpus;
1068         bool per_thread = evlist__per_thread(evlist);
1069 
1070         if (per_thread)
1071                 thread_data->nr_mmaps = nr_mmaps;
1072         else
1073                 thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1074                                                       thread_data->mask->maps.nbits);
1075         if (mmap) {
1076                 thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1077                 if (!thread_data->maps)
1078                         return -ENOMEM;
1079         }
1080         if (overwrite_mmap) {
1081                 thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1082                 if (!thread_data->overwrite_maps) {
1083                         zfree(&thread_data->maps);
1084                         return -ENOMEM;
1085                 }
1086         }
1087         pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1088                  thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1089 
1090         for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1091                 if (per_thread ||
1092                     test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1093                         if (thread_data->maps) {
1094                                 thread_data->maps[tm] = &mmap[m];
1095                                 pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1096                                           thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1097                         }
1098                         if (thread_data->overwrite_maps) {
1099                                 thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1100                                 pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1101                                           thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1102                         }
1103                         tm++;
1104                 }
1105         }
1106 
1107         return 0;
1108 }
1109 
1110 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1111 {
1112         int f, tm, pos;
1113         struct mmap *map, *overwrite_map;
1114 
1115         fdarray__init(&thread_data->pollfd, 64);
1116 
1117         for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1118                 map = thread_data->maps ? thread_data->maps[tm] : NULL;
1119                 overwrite_map = thread_data->overwrite_maps ?
1120                                 thread_data->overwrite_maps[tm] : NULL;
1121 
1122                 for (f = 0; f < evlist->core.pollfd.nr; f++) {
1123                         void *ptr = evlist->core.pollfd.priv[f].ptr;
1124 
1125                         if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1126                                 pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1127                                                               &evlist->core.pollfd);
1128                                 if (pos < 0)
1129                                         return pos;
1130                                 pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1131                                          thread_data, pos, evlist->core.pollfd.entries[f].fd);
1132                         }
1133                 }
1134         }
1135 
1136         return 0;
1137 }
1138 
1139 static void record__free_thread_data(struct record *rec)
1140 {
1141         int t;
1142         struct record_thread *thread_data = rec->thread_data;
1143 
1144         if (thread_data == NULL)
1145                 return;
1146 
1147         for (t = 0; t < rec->nr_threads; t++) {
1148                 record__thread_data_close_pipes(&thread_data[t]);
1149                 zfree(&thread_data[t].maps);
1150                 zfree(&thread_data[t].overwrite_maps);
1151                 fdarray__exit(&thread_data[t].pollfd);
1152         }
1153 
1154         zfree(&rec->thread_data);
1155 }
1156 
1157 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1158                                                     int evlist_pollfd_index,
1159                                                     int thread_pollfd_index)
1160 {
1161         size_t x = rec->index_map_cnt;
1162 
1163         if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1164                 return -ENOMEM;
1165         rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1166         rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1167         rec->index_map_cnt += 1;
1168         return 0;
1169 }
1170 
1171 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1172                                                     struct evlist *evlist,
1173                                                     struct record_thread *thread_data)
1174 {
1175         struct pollfd *e_entries = evlist->core.pollfd.entries;
1176         struct pollfd *t_entries = thread_data->pollfd.entries;
1177         int err = 0;
1178         size_t i;
1179 
1180         for (i = 0; i < rec->index_map_cnt; i++) {
1181                 int e_pos = rec->index_map[i].evlist_pollfd_index;
1182                 int t_pos = rec->index_map[i].thread_pollfd_index;
1183 
1184                 if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1185                     e_entries[e_pos].events != t_entries[t_pos].events) {
1186                         pr_err("Thread and evlist pollfd index mismatch\n");
1187                         err = -EINVAL;
1188                         continue;
1189                 }
1190                 e_entries[e_pos].revents = t_entries[t_pos].revents;
1191         }
1192         return err;
1193 }
1194 
1195 static int record__dup_non_perf_events(struct record *rec,
1196                                        struct evlist *evlist,
1197                                        struct record_thread *thread_data)
1198 {
1199         struct fdarray *fda = &evlist->core.pollfd;
1200         int i, ret;
1201 
1202         for (i = 0; i < fda->nr; i++) {
1203                 if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1204                         continue;
1205                 ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1206                 if (ret < 0) {
1207                         pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1208                         return ret;
1209                 }
1210                 pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1211                           thread_data, ret, fda->entries[i].fd);
1212                 ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1213                 if (ret < 0) {
1214                         pr_err("Failed to map thread and evlist pollfd indexes\n");
1215                         return ret;
1216                 }
1217         }
1218         return 0;
1219 }
1220 
1221 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1222 {
1223         int t, ret;
1224         struct record_thread *thread_data;
1225 
1226         rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1227         if (!rec->thread_data) {
1228                 pr_err("Failed to allocate thread data\n");
1229                 return -ENOMEM;
1230         }
1231         thread_data = rec->thread_data;
1232 
1233         for (t = 0; t < rec->nr_threads; t++)
1234                 record__thread_data_init_pipes(&thread_data[t]);
1235 
1236         for (t = 0; t < rec->nr_threads; t++) {
1237                 thread_data[t].rec = rec;
1238                 thread_data[t].mask = &rec->thread_masks[t];
1239                 ret = record__thread_data_init_maps(&thread_data[t], evlist);
1240                 if (ret) {
1241                         pr_err("Failed to initialize thread[%d] maps\n", t);
1242                         goto out_free;
1243                 }
1244                 ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1245                 if (ret) {
1246                         pr_err("Failed to initialize thread[%d] pollfd\n", t);
1247                         goto out_free;
1248                 }
1249                 if (t) {
1250                         thread_data[t].tid = -1;
1251                         ret = record__thread_data_open_pipes(&thread_data[t]);
1252                         if (ret) {
1253                                 pr_err("Failed to open thread[%d] communication pipes\n", t);
1254                                 goto out_free;
1255                         }
1256                         ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1257                                            POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1258                         if (ret < 0) {
1259                                 pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1260                                 goto out_free;
1261                         }
1262                         thread_data[t].ctlfd_pos = ret;
1263                         pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1264                                  thread_data, thread_data[t].ctlfd_pos,
1265                                  thread_data[t].pipes.msg[0]);
1266                 } else {
1267                         thread_data[t].tid = gettid();
1268 
1269                         ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1270                         if (ret < 0)
1271                                 goto out_free;
1272 
1273                         thread_data[t].ctlfd_pos = -1; /* Not used */
1274                 }
1275         }
1276 
1277         return 0;
1278 
1279 out_free:
1280         record__free_thread_data(rec);
1281 
1282         return ret;
1283 }
1284 
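/*
 * Create the ring buffers for the event list with the configured mmap,
 * AUX area, aio, affinity, flush and compression parameters, set up the
 * control fd, allocate the per-thread data and, when parallel streaming is
 * enabled, switch the output to a directory with one file per mmap.
 */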
1285 static int record__mmap_evlist(struct record *rec,
1286                                struct evlist *evlist)
1287 {
1288         int i, ret;
1289         struct record_opts *opts = &rec->opts;
1290         bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1291                                   opts->auxtrace_sample_mode;
1292         char msg[512];
1293 
1294         if (opts->affinity != PERF_AFFINITY_SYS)
1295                 cpu__setup_cpunode_map();
1296 
1297         if (evlist__mmap_ex(evlist, opts->mmap_pages,
1298                                  opts->auxtrace_mmap_pages,
1299                                  auxtrace_overwrite,
1300                                  opts->nr_cblocks, opts->affinity,
1301                                  opts->mmap_flush, opts->comp_level) < 0) {
1302                 if (errno == EPERM) {
1303                         pr_err("Permission error mapping pages.\n"
1304                                "Consider increasing "
1305                                "/proc/sys/kernel/perf_event_mlock_kb,\n"
1306                                "or try again with a smaller value of -m/--mmap_pages.\n"
1307                                "(current value: %u,%u)\n",
1308                                opts->mmap_pages, opts->auxtrace_mmap_pages);
1309                         return -errno;
1310                 } else {
1311                         pr_err("failed to mmap with %d (%s)\n", errno,
1312                                 str_error_r(errno, msg, sizeof(msg)));
1313                         if (errno)
1314                                 return -errno;
1315                         else
1316                                 return -EINVAL;
1317                 }
1318         }
1319 
1320         if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1321                 return -1;
1322 
1323         ret = record__alloc_thread_data(rec, evlist);
1324         if (ret)
1325                 return ret;
1326 
1327         if (record__threads_enabled(rec)) {
1328                 ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1329                 if (ret) {
1330                         pr_err("Failed to create data directory: %s\n", strerror(-ret));
1331                         return ret;
1332                 }
1333                 for (i = 0; i < evlist->core.nr_mmaps; i++) {
1334                         if (evlist->mmap)
1335                                 evlist->mmap[i].file = &rec->data.dir.files[i];
1336                         if (evlist->overwrite_mmap)
1337                                 evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1338                 }
1339         }
1340 
1341         return 0;
1342 }
1343 
1344 static int record__mmap(struct record *rec)
1345 {
1346         return record__mmap_evlist(rec, rec->evlist);
1347 }
1348 
1349 static int record__open(struct record *rec)
1350 {
1351         char msg[BUFSIZ];
1352         struct evsel *pos;
1353         struct evlist *evlist = rec->evlist;
1354         struct perf_session *session = rec->session;
1355         struct record_opts *opts = &rec->opts;
1356         int rc = 0;
1357 
1358         evlist__for_each_entry(evlist, pos) {
1359 try_again:
1360                 if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
1361                         if (evsel__fallback(pos, &opts->target, errno, msg, sizeof(msg))) {
1362                                 if (verbose > 0)
1363                                         ui__warning("%s\n", msg);
1364                                 goto try_again;
1365                         }
1366                         if ((errno == EINVAL || errno == EBADF) &&
1367                             pos->core.leader != &pos->core &&
1368                             pos->weak_group) {
1369                                 pos = evlist__reset_weak_group(evlist, pos, true);
1370                                 goto try_again;
1371                         }
1372                         rc = -errno;
1373                         evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
1374                         ui__error("%s\n", msg);
1375                         goto out;
1376                 }
1377 
1378                 pos->supported = true;
1379         }
1380 
1381         if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
1382                 pr_warning(
1383 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1384 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1385 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1386 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1387 "Samples in kernel modules won't be resolved at all.\n\n"
1388 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1389 "even with a suitable vmlinux or kallsyms file.\n\n");
1390         }
1391 
1392         if (evlist__apply_filters(evlist, &pos)) {
1393                 pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
1394                         pos->filter ?: "BPF", evsel__name(pos), errno,
1395                         str_error_r(errno, msg, sizeof(msg)));
1396                 rc = -1;
1397                 goto out;
1398         }
1399 
1400         rc = record__mmap(rec);
1401         if (rc)
1402                 goto out;
1403 
1404         session->evlist = evlist;
1405         perf_session__set_id_hdr_size(session);
1406 out:
1407         return rc;
1408 }
1409 
1410 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1411 {
1412         if (rec->evlist->first_sample_time == 0)
1413                 rec->evlist->first_sample_time = sample_time;
1414 
1415         if (sample_time)
1416                 rec->evlist->last_sample_time = sample_time;
1417 }
1418 
1419 static int process_sample_event(struct perf_tool *tool,
1420                                 union perf_event *event,
1421                                 struct perf_sample *sample,
1422                                 struct evsel *evsel,
1423                                 struct machine *machine)
1424 {
1425         struct record *rec = container_of(tool, struct record, tool);
1426 
1427         set_timestamp_boundary(rec, sample->time);
1428 
1429         if (rec->buildid_all)
1430                 return 0;
1431 
1432         rec->samples++;
1433         return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1434 }
1435 
1436 static int process_buildids(struct record *rec)
1437 {
1438         struct perf_session *session = rec->session;
1439 
1440         if (perf_data__size(&rec->data) == 0)
1441                 return 0;
1442 
1443         /*
1444          * During this process, it'll load the kernel map and replace
1445          * dso->long_name with the real pathname it found.  In this case
1446          * we prefer the vmlinux path like
1447          *   /lib/modules/3.16.4/build/vmlinux
1448          *
1449          * rather than build-id path (in debug directory).
1450          *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1451          */
1452         symbol_conf.ignore_vmlinux_buildid = true;
1453 
1454         /*
1455          * If --buildid-all is given, it marks all DSOs regardless of hits,
1456          * so no need to process samples. But if timestamp_boundary is enabled,
1457          * it still needs to walk on all samples to get the timestamps of
1458          * first/last samples.
1459          */
1460         if (rec->buildid_all && !rec->timestamp_boundary)
1461                 rec->tool.sample = NULL;
1462 
1463         return perf_session__process_events(session);
1464 }
1465 
1466 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1467 {
1468         int err;
1469         struct perf_tool *tool = data;
1470         /*
1471          * As for the guest kernel, when processing the record & report
1472          * subcommands we arrange the module mmaps prior to the guest kernel
1473          * mmap and trigger a dso preload, because by default guest module
1474          * symbols are loaded from guest kallsyms instead of
1475          * /lib/modules/XXX/XXX. This method avoids missing symbols when the
1476          * first address is in a module instead of in the guest kernel.
1477          */
1478         err = perf_event__synthesize_modules(tool, process_synthesized_event,
1479                                              machine);
1480         if (err < 0)
1481                 pr_err("Couldn't record guest kernel [%d]'s module"
1482                        " information.\n", machine->pid);
1483 
1484         /*
1485          * We use _stext for the guest kernel because the guest kernel's
1486          * /proc/kallsyms sometimes has no _text.
1487          */
1488         err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1489                                                  machine);
1490         if (err < 0)
1491                 pr_err("Couldn't record guest kernel [%d]'s reference"
1492                        " relocation symbol.\n", machine->pid);
1493 }
1494 
1495 static struct perf_event_header finished_round_event = {
1496         .size = sizeof(struct perf_event_header),
1497         .type = PERF_RECORD_FINISHED_ROUND,
1498 };
1499 
1500 static struct perf_event_header finished_init_event = {
1501         .size = sizeof(struct perf_event_header),
1502         .type = PERF_RECORD_FINISHED_INIT,
1503 };
1504 
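     /*
      * Unless --affinity is left in the default 'sys' mode, migrate the
      * current recording thread to the CPU mask of the mmap being read.
      */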
1505 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1506 {
1507         if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1508             !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1509                           thread->mask->affinity.nbits)) {
1510                 bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1511                 bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1512                           map->affinity_mask.bits, thread->mask->affinity.nbits);
1513                 sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1514                                         (cpu_set_t *)thread->mask->affinity.bits);
1515                 if (verbose == 2) {
1516                         pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1517                         mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1518                 }
1519         }
1520 }
1521 
1522 static size_t process_comp_header(void *record, size_t increment)
1523 {
1524         struct perf_record_compressed *event = record;
1525         size_t size = sizeof(*event);
1526 
1527         if (increment) {
1528                 event->header.size += increment;
1529                 return increment;
1530         }
1531 
1532         event->header.type = PERF_RECORD_COMPRESSED;
1533         event->header.size = size;
1534 
1535         return size;
1536 }
1537 
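     /*
      * Compress src into dst as PERF_RECORD_COMPRESSED records, using the
      * per-mmap zstd state and byte counters when the map has its own output
      * file and the session-wide ones otherwise.
      */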
1538 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
1539                             void *dst, size_t dst_size, void *src, size_t src_size)
1540 {
1541         ssize_t compressed;
1542         size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1543         struct zstd_data *zstd_data = &session->zstd_data;
1544 
1545         if (map && map->file)
1546                 zstd_data = &map->zstd_data;
1547 
1548         compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1549                                                      max_record_size, process_comp_header);
1550         if (compressed < 0)
1551                 return compressed;
1552 
1553         if (map && map->file) {
1554                 thread->bytes_transferred += src_size;
1555                 thread->bytes_compressed  += compressed;
1556         } else {
1557                 session->bytes_transferred += src_size;
1558                 session->bytes_compressed  += compressed;
1559         }
1560 
1561         return compressed;
1562 }
1563 
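     /*
      * Push the data accumulated in the evlist's mmaps (regular or overwrite)
      * to the output, via AIO when enabled, read AUX area data when needed
      * and, outside of threaded mode, emit a finished-round event if anything
      * was written.
      */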
1564 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1565                                     bool overwrite, bool synch)
1566 {
1567         u64 bytes_written = rec->bytes_written;
1568         int i;
1569         int rc = 0;
1570         int nr_mmaps;
1571         struct mmap **maps;
1572         int trace_fd = rec->data.file.fd;
1573         off_t off = 0;
1574 
1575         if (!evlist)
1576                 return 0;
1577 
1578         nr_mmaps = thread->nr_mmaps;
1579         maps = overwrite ? thread->overwrite_maps : thread->maps;
1580 
1581         if (!maps)
1582                 return 0;
1583 
1584         if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1585                 return 0;
1586 
1587         if (record__aio_enabled(rec))
1588                 off = record__aio_get_pos(trace_fd);
1589 
1590         for (i = 0; i < nr_mmaps; i++) {
1591                 u64 flush = 0;
1592                 struct mmap *map = maps[i];
1593 
1594                 if (map->core.base) {
1595                         record__adjust_affinity(rec, map);
1596                         if (synch) {
1597                                 flush = map->core.flush;
1598                                 map->core.flush = 1;
1599                         }
1600                         if (!record__aio_enabled(rec)) {
1601                                 if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1602                                         if (synch)
1603                                                 map->core.flush = flush;
1604                                         rc = -1;
1605                                         goto out;
1606                                 }
1607                         } else {
1608                                 if (record__aio_push(rec, map, &off) < 0) {
1609                                         record__aio_set_pos(trace_fd, off);
1610                                         if (synch)
1611                                                 map->core.flush = flush;
1612                                         rc = -1;
1613                                         goto out;
1614                                 }
1615                         }
1616                         if (synch)
1617                                 map->core.flush = flush;
1618                 }
1619 
1620                 if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1621                     !rec->opts.auxtrace_sample_mode &&
1622                     record__auxtrace_mmap_read(rec, map) != 0) {
1623                         rc = -1;
1624                         goto out;
1625                 }
1626         }
1627 
1628         if (record__aio_enabled(rec))
1629                 record__aio_set_pos(trace_fd, off);
1630 
1631         /*
1632          * Mark the round finished in case we wrote
1633          * at least one event.
1634          *
1635          * No need for round events in directory mode,
1636          * because per-CPU maps and files have their data
1637          * sorted by the kernel.
1638          */
1639         if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
1640                 rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1641 
1642         if (overwrite)
1643                 evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1644 out:
1645         return rc;
1646 }
1647 
1648 static int record__mmap_read_all(struct record *rec, bool synch)
1649 {
1650         int err;
1651 
1652         err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1653         if (err)
1654                 return err;
1655 
1656         return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1657 }
1658 
1659 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1660                                            void *arg __maybe_unused)
1661 {
1662         struct perf_mmap *map = fda->priv[fd].ptr;
1663 
1664         if (map)
1665                 perf_mmap__put(map);
1666 }
1667 
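     /*
      * Body of a parallel recording thread: acknowledge start-up, drain the
      * thread's mmaps in a loop, poll while there is nothing to read and
      * terminate when the control pipe is closed, acknowledging termination.
      */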
1668 static void *record__thread(void *arg)
1669 {
1670         enum thread_msg msg = THREAD_MSG__READY;
1671         bool terminate = false;
1672         struct fdarray *pollfd;
1673         int err, ctlfd_pos;
1674 
1675         thread = arg;
1676         thread->tid = gettid();
1677 
1678         err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1679         if (err == -1)
1680                 pr_warning("threads[%d]: failed to notify on start: %s\n",
1681                            thread->tid, strerror(errno));
1682 
1683         pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
1684 
1685         pollfd = &thread->pollfd;
1686         ctlfd_pos = thread->ctlfd_pos;
1687 
1688         for (;;) {
1689                 unsigned long long hits = thread->samples;
1690 
1691                 if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
1692                         break;
1693 
1694                 if (hits == thread->samples) {
1695 
1696                         err = fdarray__poll(pollfd, -1);
1697                         /*
1698                          * Propagate the error only if there is one. Ignore a positive
1699                          * number of returned events and interrupt errors.
1700                          */
1701                         if (err > 0 || (err < 0 && errno == EINTR))
1702                                 err = 0;
1703                         thread->waking++;
1704 
1705                         if (fdarray__filter(pollfd, POLLERR | POLLHUP,
1706                                             record__thread_munmap_filtered, NULL) == 0)
1707                                 break;
1708                 }
1709 
1710                 if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
1711                         terminate = true;
1712                         close(thread->pipes.msg[0]);
1713                         thread->pipes.msg[0] = -1;
1714                         pollfd->entries[ctlfd_pos].fd = -1;
1715                         pollfd->entries[ctlfd_pos].events = 0;
1716                 }
1717 
1718                 pollfd->entries[ctlfd_pos].revents = 0;
1719         }
1720         record__mmap_read_all(thread->rec, true);
1721 
1722         err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1723         if (err == -1)
1724                 pr_warning("threads[%d]: failed to notify on termination: %s\n",
1725                            thread->tid, strerror(errno));
1726 
1727         return NULL;
1728 }
1729 
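     /*
      * Start with all header features set and clear the ones that do not
      * apply to this recording session.
      */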
1730 static void record__init_features(struct record *rec)
1731 {
1732         struct perf_session *session = rec->session;
1733         int feat;
1734 
1735         for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1736                 perf_header__set_feat(&session->header, feat);
1737 
1738         if (rec->no_buildid)
1739                 perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1740 
1741 #ifdef HAVE_LIBTRACEEVENT
1742         if (!have_tracepoints(&rec->evlist->core.entries))
1743                 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1744 #endif
1745 
1746         if (!rec->opts.branch_stack)
1747                 perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1748 
1749         if (!rec->opts.full_auxtrace)
1750                 perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1751 
1752         if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1753                 perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1754 
1755         if (!rec->opts.use_clockid)
1756                 perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1757 
1758         if (!record__threads_enabled(rec))
1759                 perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1760 
1761         if (!record__comp_enabled(rec))
1762                 perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1763 
1764         perf_header__clear_feat(&session->header, HEADER_STAT);
1765 }
1766 
1767 static void
1768 record__finish_output(struct record *rec)
1769 {
1770         int i;
1771         struct perf_data *data = &rec->data;
1772         int fd = perf_data__fd(data);
1773 
1774         if (data->is_pipe) {
1775                 /* Just to display approx. size */
1776                 data->file.size = rec->bytes_written;
1777                 return;
1778         }
1779 
1780         rec->session->header.data_size += rec->bytes_written;
1781         data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1782         if (record__threads_enabled(rec)) {
1783                 for (i = 0; i < data->dir.nr; i++)
1784                         data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1785         }
1786 
1787         if (!rec->no_buildid) {
1788                 process_buildids(rec);
1789 
1790                 if (rec->buildid_all)
1791                         perf_session__dsos_hit_all(rec->session);
1792         }
1793         perf_session__write_header(rec->session, rec->evlist, fd, true);
1794 
1795         return;
1796 }
1797 
1798 static int record__synthesize_workload(struct record *rec, bool tail)
1799 {
1800         int err;
1801         struct perf_thread_map *thread_map;
1802         bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1803 
1804         if (rec->opts.tail_synthesize != tail)
1805                 return 0;
1806 
1807         thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1808         if (thread_map == NULL)
1809                 return -1;
1810 
1811         err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1812                                                  process_synthesized_event,
1813                                                  &rec->session->machines.host,
1814                                                  needs_mmap,
1815                                                  rec->opts.sample_address);
1816         perf_thread_map__put(thread_map);
1817         return err;
1818 }
1819 
1820 static int write_finished_init(struct record *rec, bool tail)
1821 {
1822         if (rec->opts.tail_synthesize != tail)
1823                 return 0;
1824 
1825         return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1826 }
1827 
1828 static int record__synthesize(struct record *rec, bool tail);
1829 
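     /*
      * Finalize the current output file and switch to a new, timestamped one
      * (--switch-output): resynthesize the tracking events so the new file is
      * self-contained and rotate old files when a file limit is configured.
      */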
1830 static int
1831 record__switch_output(struct record *rec, bool at_exit)
1832 {
1833         struct perf_data *data = &rec->data;
1834         char *new_filename = NULL;
1835         int fd, err;
1836 
1837         /* Same size as a timestamp like "2015122520103046" */
1838         char timestamp[] = "InvalidTimestamp";
1839 
1840         record__aio_mmap_read_sync(rec);
1841 
1842         write_finished_init(rec, true);
1843 
1844         record__synthesize(rec, true);
1845         if (target__none(&rec->opts.target))
1846                 record__synthesize_workload(rec, true);
1847 
1848         rec->samples = 0;
1849         record__finish_output(rec);
1850         err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1851         if (err) {
1852                 pr_err("Failed to get current timestamp\n");
1853                 return -EINVAL;
1854         }
1855 
1856         fd = perf_data__switch(data, timestamp,
1857                                rec->session->header.data_offset,
1858                                at_exit, &new_filename);
1859         if (fd >= 0 && !at_exit) {
1860                 rec->bytes_written = 0;
1861                 rec->session->header.data_size = 0;
1862         }
1863 
1864         if (!quiet) {
1865                 fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1866                         data->path, timestamp);
1867         }
1868 
1869         if (rec->switch_output.num_files) {
1870                 int n = rec->switch_output.cur_file + 1;
1871 
1872                 if (n >= rec->switch_output.num_files)
1873                         n = 0;
1874                 rec->switch_output.cur_file = n;
1875                 if (rec->switch_output.filenames[n]) {
1876                         remove(rec->switch_output.filenames[n]);
1877                         zfree(&rec->switch_output.filenames[n]);
1878                 }
1879                 rec->switch_output.filenames[n] = new_filename;
1880         } else {
1881                 free(new_filename);
1882         }
1883 
1884         /* Output tracking events */
1885         if (!at_exit) {
1886                 record__synthesize(rec, false);
1887 
1888                 /*
1889                  * In 'perf record --switch-output' without -a,
1890                  * record__synthesize() in record__switch_output() won't
1891                  * generate tracking events because there's no thread_map
1892                  * in the evlist, which causes the newly created perf.data
1893                  * to lack mmap and comm information.
1894                  * Create a fake thread_map and directly call
1895                  * perf_event__synthesize_thread_map() for those events.
1896                  */
1897                 if (target__none(&rec->opts.target))
1898                         record__synthesize_workload(rec, false);
1899                 write_finished_init(rec, false);
1900         }
1901         return fd;
1902 }
1903 
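     /*
      * Build a PERF_RECORD_LOST_SAMPLES event for one counter instance,
      * append its id sample and write it to the output.
      */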
1904 static void __record__save_lost_samples(struct record *rec, struct evsel *evsel,
1905                                         struct perf_record_lost_samples *lost,
1906                                         int cpu_idx, int thread_idx, u64 lost_count,
1907                                         u16 misc_flag)
1908 {
1909         struct perf_sample_id *sid;
1910         struct perf_sample sample = {};
1911         int id_hdr_size;
1912 
1913         lost->lost = lost_count;
1914         if (evsel->core.ids) {
1915                 sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1916                 sample.id = sid->id;
1917         }
1918 
1919         id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1920                                                        evsel->core.attr.sample_type, &sample);
1921         lost->header.size = sizeof(*lost) + id_hdr_size;
1922         lost->header.misc = misc_flag;
1923         record__write(rec, NULL, lost, lost->header.size);
1924 }
1925 
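     /*
      * Read the lost-sample counts of every counter instance, including
      * samples dropped by BPF filters, and emit PERF_RECORD_LOST_SAMPLES
      * events for the non-zero ones.
      */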
1926 static void record__read_lost_samples(struct record *rec)
1927 {
1928         struct perf_session *session = rec->session;
1929         struct perf_record_lost_samples_and_ids lost;
1930         struct evsel *evsel;
1931 
1932         /* there was an error during record__open */
1933         if (session->evlist == NULL)
1934                 return;
1935 
1936         evlist__for_each_entry(session->evlist, evsel) {
1937                 struct xyarray *xy = evsel->core.sample_id;
1938                 u64 lost_count;
1939 
1940                 if (xy == NULL || evsel->core.fd == NULL)
1941                         continue;
1942                 if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
1943                     xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
1944                         pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
1945                         continue;
1946                 }
1947 
1948                 for (int x = 0; x < xyarray__max_x(xy); x++) {
1949                         for (int y = 0; y < xyarray__max_y(xy); y++) {
1950                                 struct perf_counts_values count;
1951 
1952                                 if (perf_evsel__read(&evsel->core, x, y, &count) < 0) {
1953                                         pr_debug("read LOST count failed\n");
1954                                         return;
1955                                 }
1956 
1957                                 if (count.lost) {
1958                                         memset(&lost, 0, sizeof(lost));
1959                                         lost.lost.header.type = PERF_RECORD_LOST_SAMPLES;
1960                                         __record__save_lost_samples(rec, evsel, &lost.lost,
1961                                                                     x, y, count.lost, 0);
1962                                 }
1963                         }
1964                 }
1965 
1966                 lost_count = perf_bpf_filter__lost_count(evsel);
1967                 if (lost_count) {
1968                         memset(&lost, 0, sizeof(lost));
1969                         lost.lost.header.type = PERF_RECORD_LOST_SAMPLES;
1970                         __record__save_lost_samples(rec, evsel, &lost.lost, 0, 0, lost_count,
1971                                                     PERF_RECORD_MISC_LOST_SAMPLES_BPF);
1972                 }
1973         }
1974 }
1975 
1976 static volatile sig_atomic_t workload_exec_errno;
1977 
1978 /*
1979  * evlist__prepare_workload() will send a SIGUSR1
1980  * if the fork fails, since we asked for it by setting
1981  * its want_signal to true.
1982  */
1983 static void workload_exec_failed_signal(int signo __maybe_unused,
1984                                         siginfo_t *info,
1985                                         void *ucontext __maybe_unused)
1986 {
1987         workload_exec_errno = info->si_value.sival_int;
1988         done = 1;
1989         child_finished = 1;
1990 }
1991 
1992 static void snapshot_sig_handler(int sig);
1993 static void alarm_sig_handler(int sig);
1994 
1995 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
1996 {
1997         if (evlist) {
1998                 if (evlist->mmap && evlist->mmap[0].core.base)
1999                         return evlist->mmap[0].core.base;
2000                 if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
2001                         return evlist->overwrite_mmap[0].core.base;
2002         }
2003         return NULL;
2004 }
2005 
2006 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
2007 {
2008         const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
2009         if (pc)
2010                 return pc;
2011         return NULL;
2012 }
2013 
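     /*
      * Synthesize the events describing pre-existing system state: time
      * conversion data, the id index, auxtrace info, kernel and module maps,
      * guest machines, extra attributes, thread/cpu maps, BPF and cgroup
      * events and, finally, the already running threads of the target.
      */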
2014 static int record__synthesize(struct record *rec, bool tail)
2015 {
2016         struct perf_session *session = rec->session;
2017         struct machine *machine = &session->machines.host;
2018         struct perf_data *data = &rec->data;
2019         struct record_opts *opts = &rec->opts;
2020         struct perf_tool *tool = &rec->tool;
2021         int err = 0;
2022         event_op f = process_synthesized_event;
2023 
2024         if (rec->opts.tail_synthesize != tail)
2025                 return 0;
2026 
2027         if (data->is_pipe) {
2028                 err = perf_event__synthesize_for_pipe(tool, session, data,
2029                                                       process_synthesized_event);
2030                 if (err < 0)
2031                         goto out;
2032 
2033                 rec->bytes_written += err;
2034         }
2035 
2036         err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
2037                                           process_synthesized_event, machine);
2038         if (err)
2039                 goto out;
2040 
2041         /* Synthesize id_index before auxtrace_info */
2042         err = perf_event__synthesize_id_index(tool,
2043                                               process_synthesized_event,
2044                                               session->evlist, machine);
2045         if (err)
2046                 goto out;
2047 
2048         if (rec->opts.full_auxtrace) {
2049                 err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
2050                                         session, process_synthesized_event);
2051                 if (err)
2052                         goto out;
2053         }
2054 
2055         if (!evlist__exclude_kernel(rec->evlist)) {
2056                 err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
2057                                                          machine);
2058                 WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
2059                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2060                                    "Check /proc/kallsyms permission or run as root.\n");
2061 
2062                 err = perf_event__synthesize_modules(tool, process_synthesized_event,
2063                                                      machine);
2064                 WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
2065                                    "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2066                                    "Check /proc/modules permission or run as root.\n");
2067         }
2068 
2069         if (perf_guest) {
2070                 machines__process_guests(&session->machines,
2071                                          perf_event__synthesize_guest_os, tool);
2072         }
2073 
2074         err = perf_event__synthesize_extra_attr(&rec->tool,
2075                                                 rec->evlist,
2076                                                 process_synthesized_event,
2077                                                 data->is_pipe);
2078         if (err)
2079                 goto out;
2080 
2081         err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
2082                                                  process_synthesized_event,
2083                                                 NULL);
2084         if (err < 0) {
2085                 pr_err("Couldn't synthesize thread map.\n");
2086                 return err;
2087         }
2088 
2089         err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
2090                                              process_synthesized_event, NULL);
2091         if (err < 0) {
2092                 pr_err("Couldn't synthesize cpu map.\n");
2093                 return err;
2094         }
2095 
2096         err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
2097                                                 machine, opts);
2098         if (err < 0) {
2099                 pr_warning("Couldn't synthesize bpf events.\n");
2100                 err = 0;
2101         }
2102 
2103         if (rec->opts.synth & PERF_SYNTH_CGROUP) {
2104                 err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
2105                                                      machine);
2106                 if (err < 0) {
2107                         pr_warning("Couldn't synthesize cgroup events.\n");
2108                         err = 0;
2109                 }
2110         }
2111 
2112         if (rec->opts.nr_threads_synthesize > 1) {
2113                 mutex_init(&synth_lock);
2114                 perf_set_multithreaded();
2115                 f = process_locked_synthesized_event;
2116         }
2117 
2118         if (rec->opts.synth & PERF_SYNTH_TASK) {
2119                 bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
2120 
2121                 err = __machine__synthesize_threads(machine, tool, &opts->target,
2122                                                     rec->evlist->core.threads,
2123                                                     f, needs_mmap, opts->sample_address,
2124                                                     rec->opts.nr_threads_synthesize);
2125         }
2126 
2127         if (rec->opts.nr_threads_synthesize > 1) {
2128                 perf_set_singlethreaded();
2129                 mutex_destroy(&synth_lock);
2130         }
2131 
2132 out:
2133         return err;
2134 }
2135 
2136 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2137 {
2138         struct record *rec = data;
2139         pthread_kill(rec->thread_id, SIGUSR2);
2140         return 0;
2141 }
2142 
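     /*
      * Set up the side-band evlist: hook --switch-output-event up to signal
      * the main thread, add the BPF side-band event when supported and start
      * the side-band thread.
      */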
2143 static int record__setup_sb_evlist(struct record *rec)
2144 {
2145         struct record_opts *opts = &rec->opts;
2146 
2147         if (rec->sb_evlist != NULL) {
2148                 /*
2149                  * We get here if --switch-output-event populated the
2150                  * sb_evlist, so associate a callback that will send a SIGUSR2
2151                  * to the main thread.
2152                  */
2153                 evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2154                 rec->thread_id = pthread_self();
2155         }
2156 #ifdef HAVE_LIBBPF_SUPPORT
2157         if (!opts->no_bpf_event) {
2158                 if (rec->sb_evlist == NULL) {
2159                         rec->sb_evlist = evlist__new();
2160 
2161                         if (rec->sb_evlist == NULL) {
2162                                 pr_err("Couldn't create side band evlist.\n");
2163                                 return -1;
2164                         }
2165                 }
2166 
2167                 if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
2168                         pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n");
2169                         return -1;
2170                 }
2171         }
2172 #endif
2173         if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2174                 pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2175                 opts->no_bpf_event = true;
2176         }
2177 
2178         return 0;
2179 }
2180 
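     /*
      * When a clockid was requested, store it in the header environment
      * together with a pair of reference timestamps (gettimeofday() and the
      * selected clock) so the two can be correlated later.
      */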
2181 static int record__init_clock(struct record *rec)
2182 {
2183         struct perf_session *session = rec->session;
2184         struct timespec ref_clockid;
2185         struct timeval ref_tod;
2186         u64 ref;
2187 
2188         if (!rec->opts.use_clockid)
2189                 return 0;
2190 
2191         if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
2192                 session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
2193 
2194         session->header.env.clock.clockid = rec->opts.clockid;
2195 
2196         if (gettimeofday(&ref_tod, NULL) != 0) {
2197                 pr_err("gettimeofday failed, cannot set reference time.\n");
2198                 return -1;
2199         }
2200 
2201         if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2202                 pr_err("clock_gettime failed, cannot set reference time.\n");
2203                 return -1;
2204         }
2205 
2206         ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2207               (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2208 
2209         session->header.env.clock.tod_ns = ref;
2210 
2211         ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2212               (u64) ref_clockid.tv_nsec;
2213 
2214         session->header.env.clock.clockid_ns = ref;
2215         return 0;
2216 }
2217 
2218 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2219 {
2220         if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2221                 trigger_hit(&auxtrace_snapshot_trigger);
2222                 auxtrace_record__snapshot_started = 1;
2223                 if (auxtrace_record__snapshot_start(rec->itr))
2224                         trigger_error(&auxtrace_snapshot_trigger);
2225         }
2226 }
2227 
2228 static int record__terminate_thread(struct record_thread *thread_data)
2229 {
2230         int err;
2231         enum thread_msg ack = THREAD_MSG__UNDEFINED;
2232         pid_t tid = thread_data->tid;
2233 
2234         close(thread_data->pipes.msg[1]);
2235         thread_data->pipes.msg[1] = -1;
2236         err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2237         if (err > 0)
2238                 pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2239         else
2240                 pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2241                            thread->tid, tid);
2242 
2243         return 0;
2244 }
2245 
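     /*
      * In threaded mode, spawn one detached recording thread per thread_data
      * entry, pinned to its affinity mask where supported, wait for each to
      * acknowledge start-up and then pin the main thread to its own mask.
      */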
2246 static int record__start_threads(struct record *rec)
2247 {
2248         int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2249         struct record_thread *thread_data = rec->thread_data;
2250         sigset_t full, mask;
2251         pthread_t handle;
2252         pthread_attr_t attrs;
2253 
2254         thread = &thread_data[0];
2255 
2256         if (!record__threads_enabled(rec))
2257                 return 0;
2258 
2259         sigfillset(&full);
2260         if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2261                 pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2262                 return -1;
2263         }
2264 
2265         pthread_attr_init(&attrs);
2266         pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2267 
2268         for (t = 1; t < nr_threads; t++) {
2269                 enum thread_msg msg = THREAD_MSG__UNDEFINED;
2270 
2271 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2272                 pthread_attr_setaffinity_np(&attrs,
2273                                             MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2274                                             (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2275 #endif
2276                 if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2277                         for (tt = 1; tt < t; tt++)
2278                                 record__terminate_thread(&thread_data[tt]);
2279                         pr_err("Failed to start threads: %s\n", strerror(errno));
2280                         ret = -1;
2281                         goto out_err;
2282                 }
2283 
2284                 err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2285                 if (err > 0)
2286                         pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2287                                   thread_msg_tags[msg]);
2288                 else
2289                         pr_warning("threads[%d]: failed to receive start notification from %d\n",
2290                                    thread->tid, rec->thread_data[t].tid);
2291         }
2292 
2293         sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2294                         (cpu_set_t *)thread->mask->affinity.bits);
2295 
2296         pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2297 
2298 out_err:
2299         pthread_attr_destroy(&attrs);
2300 
2301         if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2302                 pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
2303                 ret = -1;
2304         }
2305 
2306         return ret;
2307 }
2308 
2309 static int record__stop_threads(struct record *rec)
2310 {
2311         int t;
2312         struct record_thread *thread_data = rec->thread_data;
2313 
2314         for (t = 1; t < rec->nr_threads; t++)
2315                 record__terminate_thread(&thread_data[t]);
2316 
2317         for (t = 0; t < rec->nr_threads; t++) {
2318                 rec->samples += thread_data[t].samples;
2319                 if (!record__threads_enabled(rec))
2320                         continue;
2321                 rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2322                 rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2323                 pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2324                          thread_data[t].samples, thread_data[t].waking);
2325                 if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2326                         pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2327                                  thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2328                 else
2329                         pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2330         }
2331 
2332         return 0;
2333 }
2334 
2335 static unsigned long record__waking(struct record *rec)
2336 {
2337         int t;
2338         unsigned long waking = 0;
2339         struct record_thread *thread_data = rec->thread_data;
2340 
2341         for (t = 0; t < rec->nr_threads; t++)
2342                 waking += thread_data[t].waking;
2343 
2344         return waking;
2345 }
2346 
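     /*
      * Main body of 'perf record': create the session, open and mmap the
      * events, synthesize the initial side-band data, start the workload
      * and/or enable the events, loop reading the mmaps until done and
      * finally finish or switch the output file.
      */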
2347 static int __cmd_record(struct record *rec, int argc, const char **argv)
2348 {
2349         int err;
2350         int status = 0;
2351         const bool forks = argc > 0;
2352         struct perf_tool *tool = &rec->tool;
2353         struct record_opts *opts = &rec->opts;
2354         struct perf_data *data = &rec->data;
2355         struct perf_session *session;
2356         bool disabled = false, draining = false;
2357         int fd;
2358         float ratio = 0;
2359         enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2360 
2361         atexit(record__sig_exit);
2362         signal(SIGCHLD, sig_handler);
2363         signal(SIGINT, sig_handler);
2364         signal(SIGTERM, sig_handler);
2365         signal(SIGSEGV, sigsegv_handler);
2366 
2367         if (rec->opts.record_namespaces)
2368                 tool->namespace_events = true;
2369 
2370         if (rec->opts.record_cgroup) {
2371 #ifdef HAVE_FILE_HANDLE
2372                 tool->cgroup_events = true;
2373 #else
2374                 pr_err("cgroup tracking is not supported\n");
2375                 return -1;
2376 #endif
2377         }
2378 
2379         if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2380                 signal(SIGUSR2, snapshot_sig_handler);
2381                 if (rec->opts.auxtrace_snapshot_mode)
2382                         trigger_on(&auxtrace_snapshot_trigger);
2383                 if (rec->switch_output.enabled)
2384                         trigger_on(&switch_output_trigger);
2385         } else {
2386                 signal(SIGUSR2, SIG_IGN);
2387         }
2388 
2389         session = perf_session__new(data, tool);
2390         if (IS_ERR(session)) {
2391                 pr_err("Perf session creation failed.\n");
2392                 return PTR_ERR(session);
2393         }
2394 
2395         if (record__threads_enabled(rec)) {
2396                 if (perf_data__is_pipe(&rec->data)) {
2397                         pr_err("Parallel trace streaming is not available in pipe mode.\n");
2398                         return -1;
2399                 }
2400                 if (rec->opts.full_auxtrace) {
2401                         pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2402                         return -1;
2403                 }
2404         }
2405 
2406         fd = perf_data__fd(data);
2407         rec->session = session;
2408 
2409         if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2410                 pr_err("Compression initialization failed.\n");
2411                 return -1;
2412         }
2413 #ifdef HAVE_EVENTFD_SUPPORT
2414         done_fd = eventfd(0, EFD_NONBLOCK);
2415         if (done_fd < 0) {
2416                 pr_err("Failed to create wakeup eventfd, error: %m\n");
2417                 status = -1;
2418                 goto out_delete_session;
2419         }
2420         err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2421         if (err < 0) {
2422                 pr_err("Failed to add wakeup eventfd to poll list\n");
2423                 status = err;
2424                 goto out_delete_session;
2425         }
2426 #endif // HAVE_EVENTFD_SUPPORT
2427 
2428         session->header.env.comp_type  = PERF_COMP_ZSTD;
2429         session->header.env.comp_level = rec->opts.comp_level;
2430 
2431         if (rec->opts.kcore &&
2432             !record__kcore_readable(&session->machines.host)) {
2433                 pr_err("ERROR: kcore is not readable.\n");
2434                 return -1;
2435         }
2436 
2437         if (record__init_clock(rec))
2438                 return -1;
2439 
2440         record__init_features(rec);
2441 
2442         if (forks) {
2443                 err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2444                                                workload_exec_failed_signal);
2445                 if (err < 0) {
2446                         pr_err("Couldn't run the workload!\n");
2447                         status = err;
2448                         goto out_delete_session;
2449                 }
2450         }
2451 
2452         /*
2453          * If we have just a single event and are sending data
2454          * through a pipe, we need to force ID allocation,
2455          * because we synthesize the event name through the pipe
2456          * and need the ID for that.
2457          */
2458         if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2459                 rec->opts.sample_id = true;
2460 
2461         if (rec->timestamp_filename && perf_data__is_pipe(data)) {
2462                 rec->timestamp_filename = false;
2463                 pr_warning("WARNING: --timestamp-filename option is not available in pipe mode.\n");
2464         }
2465 
2466         evlist__uniquify_name(rec->evlist);
2467 
2468         evlist__config(rec->evlist, opts, &callchain_param);
2469 
2470         /* Debug message used by test scripts */
2471         pr_debug3("perf record opening and mmapping events\n");
2472         if (record__open(rec) != 0) {
2473                 err = -1;
2474                 goto out_free_threads;
2475         }
2476         /* Debug message used by test scripts */
2477         pr_debug3("perf record done opening and mmapping events\n");
2478         session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
2479 
2480         if (rec->opts.kcore) {
2481                 err = record__kcore_copy(&session->machines.host, data);
2482                 if (err) {
2483                         pr_err("ERROR: Failed to copy kcore\n");
2484                         goto out_free_threads;
2485                 }
2486         }
2487 
2488         /*
2489          * Normally perf_session__new would do this, but it doesn't have the
2490          * evlist.
2491          */
2492         if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2493                 pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2494                 rec->tool.ordered_events = false;
2495         }
2496 
2497         if (evlist__nr_groups(rec->evlist) == 0)
2498                 perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2499 
2500         if (data->is_pipe) {
2501                 err = perf_header__write_pipe(fd);
2502                 if (err < 0)
2503                         goto out_free_threads;
2504         } else {
2505                 err = perf_session__write_header(session, rec->evlist, fd, false);
2506                 if (err < 0)
2507                         goto out_free_threads;
2508         }
2509 
2510         err = -1;
2511         if (!rec->no_buildid
2512             && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2513                 pr_err("Couldn't generate buildids. "
2514                        "Use --no-buildid to profile anyway.\n");
2515                 goto out_free_threads;
2516         }
2517 
2518         err = record__setup_sb_evlist(rec);
2519         if (err)
2520                 goto out_free_threads;
2521 
2522         err = record__synthesize(rec, false);
2523         if (err < 0)
2524                 goto out_free_threads;
2525 
2526         if (rec->realtime_prio) {
2527                 struct sched_param param;
2528 
2529                 param.sched_priority = rec->realtime_prio;
2530                 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2531                         pr_err("Could not set realtime priority.\n");
2532                         err = -1;
2533                         goto out_free_threads;
2534                 }
2535         }
2536 
2537         if (record__start_threads(rec))
2538                 goto out_free_threads;
2539 
2540         /*
2541          * When perf is starting the traced process, all the events
2542          * (apart from group members) have enable_on_exec=1 set,
2543          * so don't spoil it by prematurely enabling them.
2544          */
2545         if (!target__none(&opts->target) && !opts->target.initial_delay)
2546                 evlist__enable(rec->evlist);
2547 
2548         /*
2549          * Let the child rip
2550          */
2551         if (forks) {
2552                 struct machine *machine = &session->machines.host;
2553                 union perf_event *event;
2554                 pid_t tgid;
2555 
2556                 event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2557                 if (event == NULL) {
2558                         err = -ENOMEM;
2559                         goto out_child;
2560                 }
2561 
2562                 /*
2563                  * Some H/W events are generated before the COMM event,
2564                  * which is emitted during exec(), so perf script
2565                  * cannot see the correct process name for those events.
2566                  * Synthesize a COMM event to prevent that.
2567                  */
2568                 tgid = perf_event__synthesize_comm(tool, event,
2569                                                    rec->evlist->workload.pid,
2570                                                    process_synthesized_event,
2571                                                    machine);
2572                 free(event);
2573 
2574                 if (tgid == -1)
2575                         goto out_child;
2576 
2577                 event = malloc(sizeof(event->namespaces) +
2578                                (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2579                                machine->id_hdr_size);
2580                 if (event == NULL) {
2581                         err = -ENOMEM;
2582                         goto out_child;
2583                 }
2584 
2585                 /*
2586                  * Synthesize NAMESPACES event for the command specified.
2587                  */
2588                 perf_event__synthesize_namespaces(tool, event,
2589                                                   rec->evlist->workload.pid,
2590                                                   tgid, process_synthesized_event,
2591                                                   machine);
2592                 free(event);
2593 
2594                 evlist__start_workload(rec->evlist);
2595         }
2596 
2597         if (opts->target.initial_delay) {
2598                 pr_info(EVLIST_DISABLED_MSG);
2599                 if (opts->target.initial_delay > 0) {
2600                         usleep(opts->target.initial_delay * USEC_PER_MSEC);
2601                         evlist__enable(rec->evlist);
2602                         pr_info(EVLIST_ENABLED_MSG);
2603                 }
2604         }
2605 
2606         err = event_enable_timer__start(rec->evlist->eet);
2607         if (err)
2608                 goto out_child;
2609 
2610         /* Debug message used by test scripts */
2611         pr_debug3("perf record has started\n");
2612         fflush(stderr);
2613 
2614         trigger_ready(&auxtrace_snapshot_trigger);
2615         trigger_ready(&switch_output_trigger);
2616         perf_hooks__invoke_record_start();
2617 
2618         /*
2619          * Must write FINISHED_INIT so it will be seen after all other
2620          * synthesized user events, but before any regular events.
2621          */
2622         err = write_finished_init(rec, false);
2623         if (err < 0)
2624                 goto out_child;
2625 
2626         for (;;) {
2627                 unsigned long long hits = thread->samples;
2628 
2629                 /*
2630                  * rec->evlist->bkw_mmap_state may be
2631                  * BKW_MMAP_EMPTY here: when done == true and
2632                  * hits != rec->samples in the previous round.
2633                  *
2634                  * evlist__toggle_bkw_mmap() ensures we never
2635                  * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2636                  */
2637                 if (trigger_is_hit(&switch_output_trigger) || done || draining)
2638                         evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2639 
2640                 if (record__mmap_read_all(rec, false) < 0) {
2641                         trigger_error(&auxtrace_snapshot_trigger);
2642                         trigger_error(&switch_output_trigger);
2643                         err = -1;
2644                         goto out_child;
2645                 }
2646 
2647                 if (auxtrace_record__snapshot_started) {
2648                         auxtrace_record__snapshot_started = 0;
2649                         if (!trigger_is_error(&auxtrace_snapshot_trigger))
2650                                 record__read_auxtrace_snapshot(rec, false);
2651                         if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2652                                 pr_err("AUX area tracing snapshot failed\n");
2653                                 err = -1;
2654                                 goto out_child;
2655                         }
2656                 }
2657 
2658                 if (trigger_is_hit(&switch_output_trigger)) {
2659                         /*
2660                          * If switch_output_trigger is hit, the data in the
2661                          * overwritable ring buffer should have been collected,
2662                          * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2663                          *
2664                          * If SIGUSR2 was raised after or during record__mmap_read_all(),
2665                          * record__mmap_read_all() didn't collect data from the
2666                          * overwritable ring buffer. Read it again.
2667                          */
2668                         if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2669                                 continue;
2670                         trigger_ready(&switch_output_trigger);
2671 
2672                         /*
2673                          * Re-enable events in the overwrite ring buffer after
2674                          * record__mmap_read_all(): we should have collected
2675                          * data from it.
2676                          */
2677                         evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2678 
2679                         if (!quiet)
2680                                 fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2681                                         record__waking(rec));
2682                         thread->waking = 0;
2683                         fd = record__switch_output(rec, false);
2684                         if (fd < 0) {
2685                                 pr_err("Failed to switch to new file\n");
2686                                 trigger_error(&switch_output_trigger);
2687                                 err = fd;
2688                                 goto out_child;
2689                         }
2690 
2691                         /* re-arm the alarm */
2692                         if (rec->switch_output.time)
2693                                 alarm(rec->switch_output.time);
2694                 }
2695 
2696                 if (hits == thread->samples) {
2697                         if (done || draining)
2698                                 break;
2699                         err = fdarray__poll(&thread->pollfd, -1);
2700                         /*
2701                          * Propagate the error only if there is one. Ignore a positive
2702                          * number of returned events and interrupt errors.
2703                          */
2704                         if (err > 0 || (err < 0 && errno == EINTR))
2705                                 err = 0;
2706                         thread->waking++;
2707 
2708                         if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2709                                             record__thread_munmap_filtered, NULL) == 0)
2710                                 draining = true;
2711 
2712                         err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
2713                         if (err)
2714                                 goto out_child;
2715                 }
2716 
2717                 if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2718                         switch (cmd) {
2719                         case EVLIST_CTL_CMD_SNAPSHOT:
2720                                 hit_auxtrace_snapshot_trigger(rec);
2721                                 evlist__ctlfd_ack(rec->evlist);
2722                                 break;
2723                         case EVLIST_CTL_CMD_STOP:
2724                                 done = 1;
2725                                 break;
2726                         case EVLIST_CTL_CMD_ACK:
2727                         case EVLIST_CTL_CMD_UNSUPPORTED:
2728                         case EVLIST_CTL_CMD_ENABLE:
2729                         case EVLIST_CTL_CMD_DISABLE:
2730                         case EVLIST_CTL_CMD_EVLIST:
2731                         case EVLIST_CTL_CMD_PING:
2732                         default:
2733                                 break;
2734                         }
2735                 }
2736 
2737                 err = event_enable_timer__process(rec->evlist->eet);
2738                 if (err < 0)
2739                         goto out_child;
2740                 if (err) {
2741                         err = 0;
2742                         done = 1;
2743                 }
2744 
2745                 /*
2746                  * When perf is starting the traced process, the events die
2747                  * with the process at the end and we wait for that. Thus
2748                  * there is no need to disable the events in this case.
2749                  */
2750                 if (done && !disabled && !target__none(&opts->target)) {
2751                         trigger_off(&auxtrace_snapshot_trigger);
2752                         evlist__disable(rec->evlist);
2753                         disabled = true;
2754                 }
2755         }
2756 
2757         trigger_off(&auxtrace_snapshot_trigger);
2758         trigger_off(&switch_output_trigger);
2759 
2760         if (opts->auxtrace_snapshot_on_exit)
2761                 record__auxtrace_snapshot_exit(rec);
2762 
2763         if (forks && workload_exec_errno) {
2764                 char msg[STRERR_BUFSIZE], strevsels[2048];
2765                 const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2766 
2767                 evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
2768 
2769                 pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2770                         strevsels, argv[0], emsg);
2771                 err = -1;
2772                 goto out_child;
2773         }
2774 
2775         if (!quiet)
2776                 fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2777                         record__waking(rec));
2778 
2779         write_finished_init(rec, true);
2780 
2781         if (target__none(&rec->opts.target))
2782                 record__synthesize_workload(rec, true);
2783 
2784 out_child:
2785         record__stop_threads(rec);
2786         record__mmap_read_all(rec, true);
2787 out_free_threads:
2788         record__free_thread_data(rec);
2789         evlist__finalize_ctlfd(rec->evlist);
2790         record__aio_mmap_read_sync(rec);
2791 
2792         if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2793                 ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2794                 session->header.env.comp_ratio = ratio + 0.5;
2795         }
2796 
2797         if (forks) {
2798                 int exit_status;
2799 
2800                 if (!child_finished)
2801                         kill(rec->evlist->workload.pid, SIGTERM);
2802 
2803                 wait(&exit_status);
2804 
2805                 if (err < 0)
2806                         status = err;
2807                 else if (WIFEXITED(exit_status))
2808                         status = WEXITSTATUS(exit_status);
2809                 else if (WIFSIGNALED(exit_status))
2810                         signr = WTERMSIG(exit_status);
2811         } else
2812                 status = err;
2813 
2814         if (rec->off_cpu)
2815                 rec->bytes_written += off_cpu_write(rec->session);
2816 
2817         record__read_lost_samples(rec);
2818         record__synthesize(rec, true);
2819         /* this will be recalculated during process_buildids() */
2820         rec->samples = 0;
2821 
2822         if (!err) {
2823                 if (!rec->timestamp_filename) {
2824                         record__finish_output(rec);
2825                 } else {
2826                         fd = record__switch_output(rec, true);
2827                         if (fd < 0) {
2828                                 status = fd;
2829                                 goto out_delete_session;
2830                         }
2831                 }
2832         }
2833 
2834         perf_hooks__invoke_record_end();
2835 
2836         if (!err && !quiet) {
2837                 char samples[128];
2838                 const char *postfix = rec->timestamp_filename ?
2839                                         ".<timestamp>" : "";
2840 
2841                 if (rec->samples && !rec->opts.full_auxtrace)
2842                         scnprintf(samples, sizeof(samples),
2843                                   " (%" PRIu64 " samples)", rec->samples);
2844                 else
2845                         samples[0] = '\0';
2846 
2847                 fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
2848                         perf_data__size(data) / 1024.0 / 1024.0,
2849                         data->path, postfix, samples);
2850                 if (ratio) {
2851                         fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
2852                                         rec->session->bytes_transferred / 1024.0 / 1024.0,
2853                                         ratio);
2854                 }
2855                 fprintf(stderr, " ]\n");
2856         }
2857 
2858 out_delete_session:
2859 #ifdef HAVE_EVENTFD_SUPPORT
2860         if (done_fd >= 0) {
2861                 fd = done_fd;
2862                 done_fd = -1;
2863 
2864                 close(fd);
2865         }
2866 #endif
2867         zstd_fini(&session->zstd_data);
2868         if (!opts->no_bpf_event)
2869                 evlist__stop_sb_thread(rec->sb_evlist);
2870 
2871         perf_session__delete(session);
2872         return status;
2873 }
2874 
2875 static void callchain_debug(struct callchain_param *callchain)
2876 {
2877         static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2878 
2879         pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2880 
2881         if (callchain->record_mode == CALLCHAIN_DWARF)
2882                 pr_debug("callchain: stack dump size %d\n",
2883                          callchain->dump_size);
2884 }
2885 
2886 int record_opts__parse_callchain(struct record_opts *record,
2887                                  struct callchain_param *callchain,
2888                                  const char *arg, bool unset)
2889 {
2890         int ret;
2891         callchain->enabled = !unset;
2892 
2893         /* --no-call-graph */
2894         if (unset) {
2895                 callchain->record_mode = CALLCHAIN_NONE;
2896                 pr_debug("callchain: disabled\n");
2897                 return 0;
2898         }
2899 
2900         ret = parse_callchain_record_opt(arg, callchain);
2901         if (!ret) {
2902                 /* Enable data address sampling for DWARF unwind. */
2903                 if (callchain->record_mode == CALLCHAIN_DWARF)
2904                         record->sample_address = true;
2905                 callchain_debug(callchain);
2906         }
2907 
2908         return ret;
2909 }
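     /*
      * Illustrative command lines accepted by the parser above (the workload
      * name is a placeholder): "perf record --call-graph fp ./app" uses frame
      * pointers, "perf record --call-graph dwarf,8192 ./app" uses DWARF
      * unwinding with an 8192 byte stack dump (and, per the code above, also
      * enables data address sampling), and "--call-graph lbr" uses the CPU's
      * last branch records where available. "--no-call-graph" disables
      * callchain recording.
      */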
2910 
2911 int record_parse_callchain_opt(const struct option *opt,
2912                                const char *arg,
2913                                int unset)
2914 {
2915         return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2916 }
2917 
2918 int record_callchain_opt(const struct option *opt,
2919                          const char *arg __maybe_unused,
2920                          int unset __maybe_unused)
2921 {
2922         struct callchain_param *callchain = opt->value;
2923 
2924         callchain->enabled = true;
2925 
2926         if (callchain->record_mode == CALLCHAIN_NONE)
2927                 callchain->record_mode = CALLCHAIN_FP;
2928 
2929         callchain_debug(callchain);
2930         return 0;
2931 }
2932 
2933 static int perf_record_config(const char *var, const char *value, void *cb)
2934 {
2935         struct record *rec = cb;
2936 
2937         if (!strcmp(var, "record.build-id")) {
2938                 if (!strcmp(value, "cache"))
2939                         rec->no_buildid_cache = false;
2940                 else if (!strcmp(value, "no-cache"))
2941                         rec->no_buildid_cache = true;
2942                 else if (!strcmp(value, "skip"))
2943                         rec->no_buildid = true;
2944                 else if (!strcmp(value, "mmap"))
2945                         rec->buildid_mmap = true;
2946                 else
2947                         return -1;
2948                 return 0;
2949         }
2950         if (!strcmp(var, "record.call-graph")) {
2951                 var = "call-graph.record-mode";
2952                 return perf_default_config(var, value, cb);
2953         }
2954 #ifdef HAVE_AIO_SUPPORT
2955         if (!strcmp(var, "record.aio")) {
2956                 rec->opts.nr_cblocks = strtol(value, NULL, 0);
2957                 if (!rec->opts.nr_cblocks)
2958                         rec->opts.nr_cblocks = nr_cblocks_default;
2959         }
2960 #endif
2961         if (!strcmp(var, "record.debuginfod")) {
2962                 rec->debuginfod.urls = strdup(value);
2963                 if (!rec->debuginfod.urls)
2964                         return -ENOMEM;
2965                 rec->debuginfod.set = true;
2966         }
2967 
2968         return 0;
2969 }
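     /*
      * For reference, the variables handled above come from the perf config
      * file (e.g. ~/.perfconfig). A minimal illustrative snippet (the
      * debuginfod URL is only a placeholder) could look like:
      *
      *   [record]
      *           build-id = mmap
      *           call-graph = dwarf
      *           aio = 2
      *           debuginfod = https://debuginfod.example.org
      *
      * where "aio" is only honoured when perf is built with AIO support.
      */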
2970 
2971 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
2972 {
2973         struct record *rec = (struct record *)opt->value;
2974 
2975         return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
2976 }
2977 
2978 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2979 {
2980         struct record_opts *opts = (struct record_opts *)opt->value;
2981 
2982         if (unset || !str)
2983                 return 0;
2984 
2985         if (!strcasecmp(str, "node"))
2986                 opts->affinity = PERF_AFFINITY_NODE;
2987         else if (!strcasecmp(str, "cpu"))
2988                 opts->affinity = PERF_AFFINITY_CPU;
2989 
2990         return 0;
2991 }
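     /*
      * Example usage of the option parsed above: "perf record --affinity=node -a"
      * sets the trace reading thread's affinity mask to the NUMA node of the
      * mmap buffer being processed, "--affinity=cpu" pins it to that buffer's
      * CPU, and any other value falls through and keeps the PERF_AFFINITY_SYS
      * default set in cmd_record().
      */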
2992 
2993 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
2994 {
2995         mask->nbits = nr_bits;
2996         mask->bits = bitmap_zalloc(mask->nbits);
2997         if (!mask->bits)
2998                 return -ENOMEM;
2999 
3000         return 0;
3001 }
3002 
3003 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
3004 {
3005         bitmap_free(mask->bits);
3006         mask->nbits = 0;
3007 }
3008 
3009 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
3010 {
3011         int ret;
3012 
3013         ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
3014         if (ret) {
3015                 mask->affinity.bits = NULL;
3016                 return ret;
3017         }
3018 
3019         ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
3020         if (ret) {
3021                 record__mmap_cpu_mask_free(&mask->maps);
3022                 mask->maps.bits = NULL;
3023         }
3024 
3025         return ret;
3026 }
3027 
3028 static void record__thread_mask_free(struct thread_mask *mask)
3029 {
3030         record__mmap_cpu_mask_free(&mask->maps);
3031         record__mmap_cpu_mask_free(&mask->affinity);
3032 }
3033 
3034 static int record__parse_threads(const struct option *opt, const char *str, int unset)
3035 {
3036         int s;
3037         struct record_opts *opts = opt->value;
3038 
3039         if (unset || !str || !strlen(str)) {
3040                 opts->threads_spec = THREAD_SPEC__CPU;
3041         } else {
3042                 for (s = 1; s < THREAD_SPEC__MAX; s++) {
3043                         if (s == THREAD_SPEC__USER) {
3044                                 opts->threads_user_spec = strdup(str);
3045                                 if (!opts->threads_user_spec)
3046                                         return -ENOMEM;
3047                                 opts->threads_spec = THREAD_SPEC__USER;
3048                                 break;
3049                         }
3050                         if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3051                                 opts->threads_spec = s;
3052                                 break;
3053                         }
3054                 }
3055         }
3056 
3057         if (opts->threads_spec == THREAD_SPEC__USER)
3058                 pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3059         else
3060                 pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3061 
3062         return 0;
3063 }
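     /*
      * Example specs for the parser above, assuming the usual thread_spec_tags
      * of "cpu", "core", "package" and "numa": plain "--threads" (or an empty
      * spec) selects one writer thread per monitored CPU, "--threads=numa"
      * groups them per NUMA node, and a string matching none of the tags is
      * stored verbatim as a user spec for record__init_thread_user_masks()
      * to parse later.
      */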
3064 
3065 static int parse_output_max_size(const struct option *opt,
3066                                  const char *str, int unset)
3067 {
3068         unsigned long *s = (unsigned long *)opt->value;
3069         static struct parse_tag tags_size[] = {
3070                 { .tag  = 'B', .mult = 1       },
3071                 { .tag  = 'K', .mult = 1 << 10 },
3072                 { .tag  = 'M', .mult = 1 << 20 },
3073                 { .tag  = 'G', .mult = 1 << 30 },
3074                 { .tag  = 0 },
3075         };
3076         unsigned long val;
3077 
3078         if (unset) {
3079                 *s = 0;
3080                 return 0;
3081         }
3082 
3083         val = parse_tag_value(str, tags_size);
3084         if (val != (unsigned long) -1) {
3085                 *s = val;
3086                 return 0;
3087         }
3088 
3089         return -1;
3090 }
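     /*
      * Example for the size parsing above: "--max-size=200M" stores
      * 200 * 2^20 = 209715200 in record.output_max_size, so, per the option's
      * help text, the output file is limited to roughly 200 MiB.
      */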
3091 
3092 static int record__parse_mmap_pages(const struct option *opt,
3093                                     const char *str,
3094                                     int unset __maybe_unused)
3095 {
3096         struct record_opts *opts = opt->value;
3097         char *s, *p;
3098         unsigned int mmap_pages;
3099         int ret;
3100 
3101         if (!str)
3102                 return -EINVAL;
3103 
3104         s = strdup(str);
3105         if (!s)
3106                 return -ENOMEM;
3107 
3108         p = strchr(s, ',');
3109         if (p)
3110                 *p = '\0';
3111 
3112         if (*s) {
3113                 ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3114                 if (ret)
3115                         goto out_free;
3116                 opts->mmap_pages = mmap_pages;
3117         }
3118 
3119         if (!p) {
3120                 ret = 0;
3121                 goto out_free;
3122         }
3123 
3124         ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3125         if (ret)
3126                 goto out_free;
3127 
3128         opts->auxtrace_mmap_pages = mmap_pages;
3129 
3130 out_free:
3131         free(s);
3132         return ret;
3133 }
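     /*
      * Example arguments for the "pages[,pages]" form handled above:
      * "-m 512" sets 512 data mmap pages, "-m 512,128" additionally reserves
      * 128 pages for the AUX area tracing mmap, and a value starting with a
      * comma (e.g. "-m ,64") sets only the AUX size and leaves the data
      * buffer at its default.
      */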
3134 
3135 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
3136 {
3137 }
3138 
3139 static int parse_control_option(const struct option *opt,
3140                                 const char *str,
3141                                 int unset __maybe_unused)
3142 {
3143         struct record_opts *opts = opt->value;
3144 
3145         return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3146 }
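     /*
      * Illustrative use of the control descriptors parsed above (the fifo
      * names are placeholders):
      *
      *   mkfifo ctl.fifo ack.fifo
      *   perf record --control fifo:ctl.fifo,ack.fifo -a &
      *   echo disable > ctl.fifo    # pause the events
      *   echo enable > ctl.fifo     # resume them
      *
      * The commands arrive through the EVLIST_CTL_CMD_* handling in the main
      * loop of __cmd_record() above.
      */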
3147 
3148 static void switch_output_size_warn(struct record *rec)
3149 {
3150         u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3151         struct switch_output *s = &rec->switch_output;
3152 
3153         wakeup_size /= 2;
3154 
3155         if (s->size < wakeup_size) {
3156                 char buf[100];
3157 
3158                 unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3159                 pr_warning("WARNING: switch-output data size is lower than the "
3160                            "wakeup kernel buffer size (%s), "
3161                            "expect bigger perf.data sizes\n", buf);
3162         }
3163 }
3164 
3165 static int switch_output_setup(struct record *rec)
3166 {
3167         struct switch_output *s = &rec->switch_output;
3168         static struct parse_tag tags_size[] = {
3169                 { .tag  = 'B', .mult = 1       },
3170                 { .tag  = 'K', .mult = 1 << 10 },
3171                 { .tag  = 'M', .mult = 1 << 20 },
3172                 { .tag  = 'G', .mult = 1 << 30 },
3173                 { .tag  = 0 },
3174         };
3175         static struct parse_tag tags_time[] = {
3176                 { .tag  = 's', .mult = 1        },
3177                 { .tag  = 'm', .mult = 60       },
3178                 { .tag  = 'h', .mult = 60*60    },
3179                 { .tag  = 'd', .mult = 60*60*24 },
3180                 { .tag  = 0 },
3181         };
3182         unsigned long val;
3183 
3184         /*
3185          * If we're using --switch-output-events, then we imply
3186          * --switch-output=signal, as we'll send a SIGUSR2 from the side
3187          * band thread to its parent.
3188          */
3189         if (rec->switch_output_event_set) {
3190                 if (record__threads_enabled(rec)) {
3191                         pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3192                         return 0;
3193                 }
3194                 goto do_signal;
3195         }
3196 
3197         if (!s->set)
3198                 return 0;
3199 
3200         if (record__threads_enabled(rec)) {
3201                 pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3202                 return 0;
3203         }
3204 
3205         if (!strcmp(s->str, "signal")) {
3206 do_signal:
3207                 s->signal = true;
3208                 pr_debug("switch-output with SIGUSR2 signal\n");
3209                 goto enabled;
3210         }
3211 
3212         val = parse_tag_value(s->str, tags_size);
3213         if (val != (unsigned long) -1) {
3214                 s->size = val;
3215                 pr_debug("switch-output with %s size threshold\n", s->str);
3216                 goto enabled;
3217         }
3218 
3219         val = parse_tag_value(s->str, tags_time);
3220         if (val != (unsigned long) -1) {
3221                 s->time = val;
3222                 pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3223                          s->str, s->time);
3224                 goto enabled;
3225         }
3226 
3227         return -1;
3228 
3229 enabled:
3230         rec->timestamp_filename = true;
3231         s->enabled              = true;
3232 
3233         if (s->size && !rec->opts.no_buffering)
3234                 switch_output_size_warn(rec);
3235 
3236         return 0;
3237 }
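     /*
      * Examples of the three forms recognised above: "--switch-output" or
      * "--switch-output=signal" rotates the output file on SIGUSR2,
      * "--switch-output=1G" rotates after about 1 GiB of data and
      * "--switch-output=10m" every ten minutes; all of them also imply
      * --timestamp-filename so that the rotated files get distinct names.
      */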
3238 
3239 static const char * const __record_usage[] = {
3240         "perf record [<options>] [<command>]",
3241         "perf record [<options>] -- <command> [<options>]",
3242         NULL
3243 };
3244 const char * const *record_usage = __record_usage;
3245 
3246 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
3247                                   struct perf_sample *sample, struct machine *machine)
3248 {
3249         /*
3250          * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
3251          * so there is no need to add them twice.
3252          */
3253         if (!(event->header.misc & PERF_RECORD_MISC_USER))
3254                 return 0;
3255         return perf_event__process_mmap(tool, event, sample, machine);
3256 }
3257 
3258 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
3259                                    struct perf_sample *sample, struct machine *machine)
3260 {
3261         /*
3262          * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
3263          * so there is no need to add them twice.
3264          */
3265         if (!(event->header.misc & PERF_RECORD_MISC_USER))
3266                 return 0;
3267 
3268         return perf_event__process_mmap2(tool, event, sample, machine);
3269 }
3270 
3271 static int process_timestamp_boundary(struct perf_tool *tool,
3272                                       union perf_event *event __maybe_unused,
3273                                       struct perf_sample *sample,
3274                                       struct machine *machine __maybe_unused)
3275 {
3276         struct record *rec = container_of(tool, struct record, tool);
3277 
3278         set_timestamp_boundary(rec, sample->time);
3279         return 0;
3280 }
3281 
3282 static int parse_record_synth_option(const struct option *opt,
3283                                      const char *str,
3284                                      int unset __maybe_unused)
3285 {
3286         struct record_opts *opts = opt->value;
3287         char *p = strdup(str);
3288 
3289         if (p == NULL)
3290                 return -1;
3291 
3292         opts->synth = parse_synth_opt(p);
3293         free(p);
3294 
3295         if (opts->synth < 0) {
3296                 pr_err("Invalid synth option: %s\n", str);
3297                 return -1;
3298         }
3299         return 0;
3300 }
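     /*
      * Example values for the option parsed above, following its
      * "no|all|task|mmap|cgroup" help string: "--synth=all" (the default,
      * see PERF_SYNTH_ALL below) synthesizes the full pre-existing
      * task/mmap/cgroup state, while "--synth=no" skips that synthesis,
      * which can shorten startup on machines with very many running tasks.
      */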
3301 
3302 /*
3303  * XXX Ideally this would be local to cmd_record() and passed to a record__new(),
3304  * because we need access to it in record__exit(), which is called
3305  * after cmd_record() exits, but since record_options needs to be accessible to
3306  * builtin-script, leave it here.
3307  *
3308  * At least we don't touch it in all the other functions here directly.
3309  *
3310  * Just say no to tons of global variables, sigh.
3311  */
3312 static struct record record = {
3313         .opts = {
3314                 .sample_time         = true,
3315                 .mmap_pages          = UINT_MAX,
3316                 .user_freq           = UINT_MAX,
3317                 .user_interval       = ULLONG_MAX,
3318                 .freq                = 4000,
3319                 .target              = {
3320                         .uses_mmap   = true,
3321                         .default_per_cpu = true,
3322                 },
3323                 .mmap_flush          = MMAP_FLUSH_DEFAULT,
3324                 .nr_threads_synthesize = 1,
3325                 .ctl_fd              = -1,
3326                 .ctl_fd_ack          = -1,
3327                 .synth               = PERF_SYNTH_ALL,
3328         },
3329         .tool = {
3330                 .sample         = process_sample_event,
3331                 .fork           = perf_event__process_fork,
3332                 .exit           = perf_event__process_exit,
3333                 .comm           = perf_event__process_comm,
3334                 .namespaces     = perf_event__process_namespaces,
3335                 .mmap           = build_id__process_mmap,
3336                 .mmap2          = build_id__process_mmap2,
3337                 .itrace_start   = process_timestamp_boundary,
3338                 .aux            = process_timestamp_boundary,
3339                 .ordered_events = true,
3340         },
3341 };
3342 
3343 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
3344         "\n\t\t\t\tDefault: fp";
3345 
3346 static bool dry_run;
3347 
3348 static struct parse_events_option_args parse_events_option_args = {
3349         .evlistp = &record.evlist,
3350 };
3351 
3352 static struct parse_events_option_args switch_output_parse_events_option_args = {
3353         .evlistp = &record.sb_evlist,
3354 };
3355 
3356 /*
3357  * XXX Will stay a global variable until we fix builtin-script.c to stop messing
3358  * with it and switch to using the library functions in perf_evlist that came
3359  * from builtin-record.c, i.e. use record_opts,
3360  * evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record',
3361  * using pipes, etc.
3362  */
3363 static struct option __record_options[] = {
3364         OPT_CALLBACK('e', "event", &parse_events_option_args, "event",
3365                      "event selector. use 'perf list' to list available events",
3366                      parse_events_option),
3367         OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3368                      "event filter", parse_filter),
3369         OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3370                            NULL, "don't record events from perf itself",
3371                            exclude_perf),
3372         OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3373                     "record events on existing process id"),
3374         OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3375                     "record events on existing thread id"),
3376         OPT_INTEGER('r', "realtime", &record.realtime_prio,
3377                     "collect data with this RT SCHED_FIFO priority"),
3378         OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3379                     "collect data without buffering"),
3380         OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3381                     "collect raw sample records from all opened counters"),
3382         OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3383                             "system-wide collection from all CPUs"),
3384         OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3385                     "list of cpus to monitor"),
3386         OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3387         OPT_STRING('o', "output", &record.data.path, "file",
3388                     "output file name"),
3389         OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3390                         &record.opts.no_inherit_set,
3391                         "child tasks do not inherit counters"),
3392         OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3393                     "synthesize non-sample events at the end of output"),
3394         OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3395         OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3396         OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3397                     "Fail if the specified frequency can't be used"),
3398         OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3399                      "profile at this frequency",
3400                       record__parse_freq),
3401         OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3402                      "number of mmap data pages and AUX area tracing mmap pages",
3403                      record__parse_mmap_pages),
3404         OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3405                      "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3406                      record__mmap_flush_parse),
3407         OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3408                            NULL, "enables call-graph recording",
3409                            &record_callchain_opt),
3410         OPT_CALLBACK(0, "call-graph", &record.opts,
3411                      "record_mode[,record_size]", record_callchain_help,
3412                      &record_parse_callchain_opt),
3413         OPT_INCR('v', "verbose", &verbose,
3414                     "be more verbose (show counter open errors, etc)"),
3415         OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
3416         OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3417                     "per thread counts"),
3418         OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3419         OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3420                     "Record the sample physical addresses"),
3421         OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3422                     "Record the sampled data address data page size"),
3423         OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3424                     "Record the sampled code address (ip) page size"),
3425         OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3426         OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3427                     "Record the sample identifier"),
3428         OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3429                         &record.opts.sample_time_set,
3430                         "Record the sample timestamps"),
3431         OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3432                         "Record the sample period"),
3433         OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3434                     "don't sample"),
3435         OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3436                         &record.no_buildid_cache_set,
3437                         "do not update the buildid cache"),
3438         OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3439                         &record.no_buildid_set,
3440                         "do not collect buildids in perf.data"),
3441         OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3442                      "monitor event in cgroup name only",
3443                      parse_cgroups),
3444         OPT_CALLBACK('D', "delay", &record, "ms",
3445                      "ms to wait before starting measurement after program start (-1: start with events disabled), "
3446                      "or ranges of time to enable events e.g. '-D 10-20,30-40'",
3447                      record__parse_event_enable_time),
3448         OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3449         OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
3450                    "user to profile"),
3451 
3452         OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3453                      "branch any", "sample any taken branches",
3454                      parse_branch_stack),
3455 
3456         OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3457                      "branch filter mask", "branch stack filter modes",
3458                      parse_branch_stack),
3459         OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3460                     "sample by weight (on special events only)"),
3461         OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3462                     "sample transaction flags (special events only)"),
3463         OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3464                     "use per-thread mmaps"),
3465         OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3466                     "sample selected machine registers on interrupt,"
3467                     " use '-I?' to list register names", parse_intr_regs),
3468         OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3469                     "sample selected machine registers on interrupt,"
3470                     " use '--user-regs=?' to list register names", parse_user_regs),
3471         OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3472                     "Record running/enabled time of read (:S) events"),
3473         OPT_CALLBACK('k', "clockid", &record.opts,
3474                      "clockid", "clockid to use for events, see clock_gettime()",
3475                      parse_clockid),
3476         OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3477                           "opts", "AUX area tracing Snapshot Mode", ""),
3478         OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3479                           "opts", "sample AUX area", ""),
3480         OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3481                         "per thread proc mmap processing timeout in ms"),
3482         OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3483                     "Record namespaces events"),
3484         OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3485                     "Record cgroup events"),
3486         OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3487                         &record.opts.record_switch_events_set,
3488                         "Record context switch events"),
3489         OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3490                          "Configure all used events to run in kernel space.",
3491                          PARSE_OPT_EXCLUSIVE),
3492         OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3493                          "Configure all used events to run in user space.",
3494                          PARSE_OPT_EXCLUSIVE),
3495         OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3496                     "collect kernel callchains"),
3497         OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3498                     "collect user callchains"),
3499         OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3500                    "file", "vmlinux pathname"),
3501         OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3502                     "Record build-id of all DSOs regardless of hits"),
3503         OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
3504                     "Record build-id in map events"),
3505         OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3506                     "append timestamp to output filename"),
3507         OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3508                     "Record timestamp boundary (time of first/last samples)"),
3509         OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3510                           &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3511                           "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3512                           "signal"),
3513         OPT_CALLBACK_SET(0, "switch-output-event", &switch_output_parse_events_option_args,
3514                          &record.switch_output_event_set, "switch output event",
3515                          "switch output event selector. use 'perf list' to list available events",
3516                          parse_events_option_new_evlist),
3517         OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3518                    "Limit number of switch output generated files"),
3519         OPT_BOOLEAN(0, "dry-run", &dry_run,
3520                     "Parse options then exit"),
3521 #ifdef HAVE_AIO_SUPPORT
3522         OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3523                      &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3524                      record__aio_parse),
3525 #endif
3526         OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3527                      "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3528                      record__parse_affinity),
3529 #ifdef HAVE_ZSTD_SUPPORT
3530         OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3531                             "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3532                             record__parse_comp_level),
3533 #endif
3534         OPT_CALLBACK(0, "max-size", &record.output_max_size,
3535                      "size", "Limit the maximum size of the output file", parse_output_max_size),
3536         OPT_UINTEGER(0, "num-thread-synthesize",
3537                      &record.opts.nr_threads_synthesize,
3538                      "number of threads to run for event synthesis"),
3539 #ifdef HAVE_LIBPFM
3540         OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3541                 "libpfm4 event selector. use 'perf list' to list available events",
3542                 parse_libpfm_events_option),
3543 #endif
3544         OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3545                      "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3546                      "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
3547                      "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3548                      "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3549                       parse_control_option),
3550         OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3551                      "Fine-tune event synthesis: default=all", parse_record_synth_option),
3552         OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3553                           &record.debuginfod.set, "debuginfod urls",
3554                           "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3555                           "system"),
3556         OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3557                             "write collected trace data into several data files using parallel threads",
3558                             record__parse_threads),
3559         OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3560         OPT_END()
3561 };
3562 
3563 struct option *record_options = __record_options;
3564 
3565 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3566 {
3567         struct perf_cpu cpu;
3568         int idx;
3569 
3570         if (cpu_map__is_dummy(cpus))
3571                 return 0;
3572 
3573         perf_cpu_map__for_each_cpu_skip_any(cpu, idx, cpus) {
3574                 /* Return ENODEV if the input cpu is greater than max cpu */
3575                 if ((unsigned long)cpu.cpu > mask->nbits)
3576                         return -ENODEV;
3577                 __set_bit(cpu.cpu, mask->bits);
3578         }
3579 
3580         return 0;
3581 }
3582 
3583 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3584 {
3585         struct perf_cpu_map *cpus;
3586 
3587         cpus = perf_cpu_map__new(mask_spec);
3588         if (!cpus)
3589                 return -ENOMEM;
3590 
3591         bitmap_zero(mask->bits, mask->nbits);
3592         if (record__mmap_cpu_mask_init(mask, cpus))
3593                 return -ENODEV;
3594 
3595         perf_cpu_map__put(cpus);
3596 
3597         return 0;
3598 }
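     /*
      * The mask_spec strings handled above use the usual cpu list syntax
      * accepted by perf_cpu_map__new(), e.g. "0-3,8" sets bits 0, 1, 2, 3
      * and 8 in the resulting bitmap.
      */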
3599 
3600 static void record__free_thread_masks(struct record *rec, int nr_threads)
3601 {
3602         int t;
3603 
3604         if (rec->thread_masks)
3605                 for (t = 0; t < nr_threads; t++)
3606                         record__thread_mask_free(&rec->thread_masks[t]);
3607 
3608         zfree(&rec->thread_masks);
3609 }
3610 
3611 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3612 {
3613         int t, ret;
3614 
3615         rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3616         if (!rec->thread_masks) {
3617                 pr_err("Failed to allocate thread masks\n");
3618                 return -ENOMEM;
3619         }
3620 
3621         for (t = 0; t < nr_threads; t++) {
3622                 ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3623                 if (ret) {
3624                         pr_err("Failed to allocate thread masks[%d]\n", t);
3625                         goto out_free;
3626                 }
3627         }
3628 
3629         return 0;
3630 
3631 out_free:
3632         record__free_thread_masks(rec, nr_threads);
3633 
3634         return ret;
3635 }
3636 
3637 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3638 {
3639         int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3640 
3641         ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3642         if (ret)
3643                 return ret;
3644 
3645         rec->nr_threads = nr_cpus;
3646         pr_debug("nr_threads: %d\n", rec->nr_threads);
3647 
3648         for (t = 0; t < rec->nr_threads; t++) {
3649                 __set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3650                 __set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3651                 if (verbose > 0) {
3652                         pr_debug("thread_masks[%d]: ", t);
3653                         mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3654                         pr_debug("thread_masks[%d]: ", t);
3655                         mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3656                 }
3657         }
3658 
3659         return 0;
3660 }
3661 
3662 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3663                                           const char **maps_spec, const char **affinity_spec,
3664                                           u32 nr_spec)
3665 {
3666         u32 s;
3667         int ret = 0, t = 0;
3668         struct mmap_cpu_mask cpus_mask;
3669         struct thread_mask thread_mask, full_mask, *thread_masks;
3670 
3671         ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3672         if (ret) {
3673                 pr_err("Failed to allocate CPUs mask\n");
3674                 return ret;
3675         }
3676 
3677         ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3678         if (ret) {
3679                 pr_err("Failed to init cpu mask\n");
3680                 goto out_free_cpu_mask;
3681         }
3682 
3683         ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3684         if (ret) {
3685                 pr_err("Failed to allocate full mask\n");
3686                 goto out_free_cpu_mask;
3687         }
3688 
3689         ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3690         if (ret) {
3691                 pr_err("Failed to allocate thread mask\n");
3692                 goto out_free_full_and_cpu_masks;
3693         }
3694 
3695         for (s = 0; s < nr_spec; s++) {
3696                 ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3697                 if (ret) {
3698                         pr_err("Failed to initialize maps thread mask\n");
3699                         goto out_free;
3700                 }
3701                 ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3702                 if (ret) {
3703                         pr_err("Failed to initialize affinity thread mask\n");
3704                         goto out_free;
3705                 }
3706 
3707                 /* ignore invalid CPUs but do not allow empty masks */
3708                 if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3709                                 cpus_mask.bits, thread_mask.maps.nbits)) {
3710                         pr_err("Empty maps mask: %s\n", maps_spec[s]);
3711                         ret = -EINVAL;
3712                         goto out_free;
3713                 }
3714                 if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3715                                 cpus_mask.bits, thread_mask.affinity.nbits)) {
3716                         pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3717                         ret = -EINVAL;
3718                         goto out_free;
3719                 }
3720 
3721                 /* do not allow intersection with other masks (full_mask) */
3722                 if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3723                                       thread_mask.maps.nbits)) {
3724                         pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3725                         ret = -EINVAL;
3726                         goto out_free;
3727                 }
3728                 if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3729                                       thread_mask.affinity.nbits)) {
3730                         pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3731                         ret = -EINVAL;
3732                         goto out_free;
3733                 }
3734 
3735                 bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3736                           thread_mask.maps.bits, full_mask.maps.nbits);
3737                 bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3738                           thread_mask.affinity.bits, full_mask.affinity.nbits);
3739 
3740                 thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3741                 if (!thread_masks) {
3742                         pr_err("Failed to reallocate thread masks\n");
3743                         ret = -ENOMEM;
3744                         goto out_free;
3745                 }
3746                 rec->thread_masks = thread_masks;
3747                 rec->thread_masks[t] = thread_mask;
3748                 if (verbose > 0) {
3749                         pr_debug("thread_masks[%d]: ", t);
3750                         mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3751                         pr_debug("thread_masks[%d]: ", t);
3752                         mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3753                 }
3754                 t++;
3755                 ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3756                 if (ret) {
3757                         pr_err("Failed to allocate thread mask\n");
3758                         goto out_free_full_and_cpu_masks;
3759                 }
3760         }
3761         rec->nr_threads = t;
3762         pr_debug("nr_threads: %d\n", rec->nr_threads);
3763         if (!rec->nr_threads)
3764                 ret = -EINVAL;
3765 
3766 out_free:
3767         record__thread_mask_free(&thread_mask);
3768 out_free_full_and_cpu_masks:
3769         record__thread_mask_free(&full_mask);
3770 out_free_cpu_mask:
3771         record__mmap_cpu_mask_free(&cpus_mask);
3772 
3773         return ret;
3774 }
3775 
3776 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3777 {
3778         int ret;
3779         struct cpu_topology *topo;
3780 
3781         topo = cpu_topology__new();
3782         if (!topo) {
3783                 pr_err("Failed to allocate CPU topology\n");
3784                 return -ENOMEM;
3785         }
3786 
3787         ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3788                                              topo->core_cpus_list, topo->core_cpus_lists);
3789         cpu_topology__delete(topo);
3790 
3791         return ret;
3792 }
3793 
3794 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3795 {
3796         int ret;
3797         struct cpu_topology *topo;
3798 
3799         topo = cpu_topology__new();
3800         if (!topo) {
3801                 pr_err("Failed to allocate CPU topology\n");
3802                 return -ENOMEM;
3803         }
3804 
3805         ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3806                                              topo->package_cpus_list, topo->package_cpus_lists);
3807         cpu_topology__delete(topo);
3808 
3809         return ret;
3810 }
3811 
3812 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3813 {
3814         u32 s;
3815         int ret;
3816         const char **spec;
3817         struct numa_topology *topo;
3818 
3819         topo = numa_topology__new();
3820         if (!topo) {
3821                 pr_err("Failed to allocate NUMA topology\n");
3822                 return -ENOMEM;
3823         }
3824 
3825         spec = zalloc(topo->nr * sizeof(char *));
3826         if (!spec) {
3827                 pr_err("Failed to allocate NUMA spec\n");
3828                 ret = -ENOMEM;
3829                 goto out_delete_topo;
3830         }
3831         for (s = 0; s < topo->nr; s++)
3832                 spec[s] = topo->nodes[s].cpus;
3833 
3834         ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3835 
3836         zfree(&spec);
3837 
3838 out_delete_topo:
3839         numa_topology__delete(topo);
3840 
3841         return ret;
3842 }
3843 
3844 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3845 {
3846         int t, ret;
3847         u32 s, nr_spec = 0;
3848         char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3849         char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3850 
3851         for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3852                 spec = strtok_r(user_spec, ":", &spec_ptr);
3853                 if (spec == NULL)
3854                         break;
3855                 pr_debug2("threads_spec[%d]: %s\n", t, spec);
3856                 mask = strtok_r(spec, "/", &mask_ptr);
3857                 if (mask == NULL)
3858                         break;
3859                 pr_debug2("  maps mask: %s\n", mask);
3860                 tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3861                 if (!tmp_spec) {
3862                         pr_err("Failed to reallocate maps spec\n");
3863                         ret = -ENOMEM;
3864                         goto out_free;
3865                 }
3866                 maps_spec = tmp_spec;
3867                 maps_spec[nr_spec] = dup_mask = strdup(mask);
3868                 if (!maps_spec[nr_spec]) {
3869                         pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3870                         ret = -ENOMEM;
3871                         goto out_free;
3872                 }
3873                 mask = strtok_r(NULL, "/", &mask_ptr);
3874                 if (mask == NULL) {
3875                         pr_err("Invalid thread maps or affinity specs\n");
3876                         ret = -EINVAL;
3877                         goto out_free;
3878                 }
3879                 pr_debug2("  affinity mask: %s\n", mask);
3880                 tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3881                 if (!tmp_spec) {
3882                         pr_err("Failed to reallocate affinity spec\n");
3883                         ret = -ENOMEM;
3884                         goto out_free;
3885                 }
3886                 affinity_spec = tmp_spec;
3887                 affinity_spec[nr_spec] = strdup(mask);
3888                 if (!affinity_spec[nr_spec]) {
3889                         pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3890                         ret = -ENOMEM;
3891                         goto out_free;
3892                 }
3893                 dup_mask = NULL;
3894                 nr_spec++;
3895         }
3896 
3897         ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3898                                              (const char **)affinity_spec, nr_spec);
3899 
3900 out_free:
3901         free(dup_mask);
3902         for (s = 0; s < nr_spec; s++) {
3903                 if (maps_spec)
3904                         free(maps_spec[s]);
3905                 if (affinity_spec)
3906                         free(affinity_spec[s]);
3907         }
3908         free(affinity_spec);
3909         free(maps_spec);
3910 
3911         return ret;
3912 }
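     /*
      * Illustrative user spec for the parser above: with
      * "--threads=0-3/0-3:4-7/4-7" the string is split on ':' into two
      * <maps>/<affinity> pairs, yielding one writer thread that reads the
      * mmap buffers of CPUs 0-3 with its affinity mask set to 0-3, and a
      * second one covering CPUs 4-7.
      */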
3913 
3914 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
3915 {
3916         int ret;
3917 
3918         ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
3919         if (ret)
3920                 return ret;
3921 
3922         if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
3923                 return -ENODEV;
3924 
3925         rec->nr_threads = 1;
3926 
3927         return 0;
3928 }
3929 
3930 static int record__init_thread_masks(struct record *rec)
3931 {
3932         int ret = 0;
3933         struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
3934 
3935         if (!record__threads_enabled(rec))
3936                 return record__init_thread_default_masks(rec, cpus);
3937 
3938         if (evlist__per_thread(rec->evlist)) {
3939                 pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
3940                 return -EINVAL;
3941         }
3942 
3943         switch (rec->opts.threads_spec) {
3944         case THREAD_SPEC__CPU:
3945                 ret = record__init_thread_cpu_masks(rec, cpus);
3946                 break;
3947         case THREAD_SPEC__CORE:
3948                 ret = record__init_thread_core_masks(rec, cpus);
3949                 break;
3950         case THREAD_SPEC__PACKAGE:
3951                 ret = record__init_thread_package_masks(rec, cpus);
3952                 break;
3953         case THREAD_SPEC__NUMA:
3954                 ret = record__init_thread_numa_masks(rec, cpus);
3955                 break;
3956         case THREAD_SPEC__USER:
3957                 ret = record__init_thread_user_masks(rec, cpus);
3958                 break;
3959         default:
3960                 break;
3961         }
3962 
3963         return ret;
3964 }
3965 
3966 int cmd_record(int argc, const char **argv)
3967 {
3968         int err;
3969         struct record *rec = &record;
3970         char errbuf[BUFSIZ];
3971 
3972         setlocale(LC_ALL, "");
3973 
3974 #ifndef HAVE_BPF_SKEL
3975 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
3976         set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
3977 # undef set_nobuild
3978 #endif
3979 
3980         /* Disable eager loading of kernel symbols, which adds overhead to perf record. */
3981         symbol_conf.lazy_load_kernel_maps = true;
3982         rec->opts.affinity = PERF_AFFINITY_SYS;
3983 
3984         rec->evlist = evlist__new();
3985         if (rec->evlist == NULL)
3986                 return -ENOMEM;
3987 
3988         err = perf_config(perf_record_config, rec);
3989         if (err)
3990                 return err;
3991 
3992         argc = parse_options(argc, argv, record_options, record_usage,
3993                             PARSE_OPT_STOP_AT_NON_OPTION);
3994         if (quiet)
3995                 perf_quiet_option();
3996 
3997         err = symbol__validate_sym_arguments();
3998         if (err)
3999                 return err;
4000 
4001         perf_debuginfod_setup(&record.debuginfod);
4002 
4003         /* Make system wide (-a) the default target. */
4004         if (!argc && target__none(&rec->opts.target))
4005                 rec->opts.target.system_wide = true;
4006 
4007         if (nr_cgroups && !rec->opts.target.system_wide) {
4008                 usage_with_options_msg(record_usage, record_options,
4009                         "cgroup monitoring only available in system-wide mode");
4011         }
4012 
4013         if (rec->buildid_mmap) {
4014                 if (!perf_can_record_build_id()) {
4015                         pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
4016                         err = -EINVAL;
4017                         goto out_opts;
4018                 }
4019                 pr_debug("Enabling build id in mmap2 events.\n");
4020                 /* Enable mmap build id synthesizing. */
4021                 symbol_conf.buildid_mmap2 = true;
4022                 /* Enable perf_event_attr::build_id bit. */
4023                 rec->opts.build_id = true;
4024                 /* Disable build id cache. */
4025                 rec->no_buildid = true;
4026         }
4027 
4028         if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
4029                 pr_err("Kernel has no cgroup sampling support.\n");
4030                 err = -EINVAL;
4031                 goto out_opts;
4032         }
4033 
4034         if (rec->opts.kcore)
4035                 rec->opts.text_poke = true;
4036 
4037         if (rec->opts.kcore || record__threads_enabled(rec))
4038                 rec->data.is_dir = true;
4039 
4040         if (record__threads_enabled(rec)) {
4041                 if (rec->opts.affinity != PERF_AFFINITY_SYS) {
4042                         pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
                             err = -EINVAL;
4043                         goto out_opts;
4044                 }
4045                 if (record__aio_enabled(rec)) {
4046                         pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
                             err = -EINVAL;
4047                         goto out_opts;
4048                 }
4049         }
4050 
4051         if (rec->opts.comp_level != 0) {
4052                 pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4053                 rec->no_buildid = true;
4054         }
4055 
4056         if (rec->opts.record_switch_events &&
4057             !perf_can_record_switch_events()) {
4058                 ui__error("kernel does not support recording context switch events\n");
4059                 parse_options_usage(record_usage, record_options, "switch-events", 0);
4060                 err = -EINVAL;
4061                 goto out_opts;
4062         }
4063 
4064         if (switch_output_setup(rec)) {
4065                 parse_options_usage(record_usage, record_options, "switch-output", 0);
4066                 err = -EINVAL;
4067                 goto out_opts;
4068         }
4069 
4070         if (rec->switch_output.time) {
4071                 signal(SIGALRM, alarm_sig_handler);
4072                 alarm(rec->switch_output.time);
4073         }
4074 
4075         if (rec->switch_output.num_files) {
4076                 rec->switch_output.filenames = calloc(rec->switch_output.num_files,
4077                                                       sizeof(char *));
4078                 if (!rec->switch_output.filenames) {
4079                         err = -EINVAL;
4080                         goto out_opts;
4081                 }
4082         }
4083 
4084         if (rec->timestamp_filename && record__threads_enabled(rec)) {
4085                 rec->timestamp_filename = false;
4086                 pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4087         }
4088 
4089         /*
4090          * Allow aliases to facilitate the lookup of symbols for address
4091          * filters. Refer to auxtrace_parse_filters().
4092          */
4093         symbol_conf.allow_aliases = true;
4094 
4095         symbol__init(NULL);
4096 
4097         err = record__auxtrace_init(rec);
4098         if (err)
4099                 goto out;
4100 
4101         if (dry_run)
4102                 goto out;
4103 
4104         err = -ENOMEM;
4105 
4106         if (rec->no_buildid_cache || rec->no_buildid) {
4107                 disable_buildid_cache();
4108         } else if (rec->switch_output.enabled) {
4109                 /*
4110                  * In 'perf record --switch-output', disable buildid
4111                  * generation by default to reduce data file switching
4112                  * overhead. Still generate buildids if they are required
4113                  * explicitly using
4114                  *
4115                  *  perf record --switch-output --no-no-buildid \
4116                  *              --no-no-buildid-cache
4117                  *
4118                  * The following code is equivalent to:
4119                  *
4120                  * if ((rec->no_buildid || !rec->no_buildid_set) &&
4121                  *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4122                  *         disable_buildid_cache();
4123                  */
4124                 bool disable = true;
4125 
4126                 if (rec->no_buildid_set && !rec->no_buildid)
4127                         disable = false;
4128                 if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4129                         disable = false;
4130                 if (disable) {
4131                         rec->no_buildid = true;
4132                         rec->no_buildid_cache = true;
4133                         disable_buildid_cache();
4134                 }
4135         }
4136 
4137         if (record.opts.overwrite)
4138                 record.opts.tail_synthesize = true;
4139 
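             /*
              * No events were given on the command line: default to cycles at
              * the highest available precision (':P'), restricted to user
              * space (':u') when the paranoid setting or missing privileges
              * do not allow profiling the kernel.
              */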
4140         if (rec->evlist->core.nr_entries == 0) {
4141                 bool can_profile_kernel = perf_event_paranoid_check(1);
4142 
4143                 err = parse_event(rec->evlist, can_profile_kernel ? "cycles:P" : "cycles:Pu");
4144                 if (err)
4145                         goto out;
4146         }
4147 
4148         if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4149                 rec->opts.no_inherit = true;
4150 
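             /*
              * Sanity-check the combination of target options (pid/tid, cpu,
              * uid, system-wide).  Problems reported by target__validate()
              * are only warned about, whereas a uid that cannot be resolved
              * below is treated as a hard error.
              */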
4151         err = target__validate(&rec->opts.target);
4152         if (err) {
4153                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4154                 ui__warning("%s\n", errbuf);
4155         }
4156 
4157         err = target__parse_uid(&rec->opts.target);
4158         if (err) {
4159                 int saved_errno = errno;
4160 
4161                 target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4162                 ui__error("%s", errbuf);
4163 
4164                 err = -saved_errno;
4165                 goto out;
4166         }
4167 
4168         /* Enable ignoring missing threads when the -u or -p option is given. */
4169         rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
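             /*
              * (Threads can exit between the time the target is enumerated
              * and the time the events are opened on them; ignoring the
              * missing ones avoids aborting the whole session.)
              */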
4170 
4171         evlist__warn_user_requested_cpus(rec->evlist, rec->opts.target.cpu_list);
4172 
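             /*
              * For frame-pointer based call chains, let the architecture
              * request any extra sample state it needs to recover the caller
              * of leaf functions (see arch__add_leaf_frame_record_opts()).
              */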
4173         if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4174                 arch__add_leaf_frame_record_opts(&rec->opts);
4175 
4176         err = -ENOMEM;
4177         if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4178                 if (rec->opts.target.pid != NULL) {
4179                         pr_err("Couldn't create thread/CPU maps: %s\n",
4180                                 errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4181                         goto out;
4182                 } else {
4183                         usage_with_options(record_usage, record_options);
4184                 }
4185         }
4186 
4187         err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4188         if (err)
4189                 goto out;
4190 
4191         /*
4192          * Take all buildids when the file contains AUX area tracing
4193          * data, because decoding the trace just to find the relevant
4194          * buildids would take too long.
4195          */
4196         if (rec->opts.full_auxtrace)
4197                 rec->buildid_all = true;
4198 
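             /*
              * --text-poke: record kernel text modifications (jump labels,
              * ftrace patching, etc.) so that tools decoding the trace can
              * keep an accurate view of kernel code.
              */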
4199         if (rec->opts.text_poke) {
4200                 err = record__config_text_poke(rec->evlist);
4201                 if (err) {
4202                         pr_err("record__config_text_poke failed, error %d\n", err);
4203                         goto out;
4204                 }
4205         }
4206 
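             /*
              * --off-cpu: set up the BPF side that accounts time spent
              * scheduled out (blocked) and reports it as samples.
              */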
4207         if (rec->off_cpu) {
4208                 err = record__config_off_cpu(rec);
4209                 if (err) {
4210                         pr_err("record__config_off_cpu failed, error %d\n", err);
4211                         goto out;
4212                 }
4213         }
4214 
4215         if (record_opts__config(&rec->opts)) {
4216                 err = -EINVAL;
4217                 goto out;
4218         }
4219 
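             /*
              * Ensure the side-band records the session relies on (mmap,
              * comm, task, ...) are captured, which may mean adding a
              * dedicated dummy tracking event; see
              * record__config_tracking_events().
              */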
4220         err = record__config_tracking_events(rec);
4221         if (err) {
4222                 pr_err("record__config_tracking_events failed, error %d\n", err);
4223                 goto out;
4224         }
4225 
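             /*
              * Compute the per-thread mmap and CPU affinity masks used for
              * data streaming; without --threads this is a single mask
              * covering everything.
              */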
4226         err = record__init_thread_masks(rec);
4227         if (err) {
4228                 pr_err("Failed to initialize parallel data streaming masks\n");
4229                 goto out;
4230         }
4231 
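             /*
              * Clamp the --aio control block count and the compression level
              * (just below) to their supported maxima before starting the
              * session.
              */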
4232         if (rec->opts.nr_cblocks > nr_cblocks_max)
4233                 rec->opts.nr_cblocks = nr_cblocks_max;
4234         pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4235 
4236         pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4237         pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4238 
4239         if (rec->opts.comp_level > comp_level_max)
4240                 rec->opts.comp_level = comp_level_max;
4241         pr_debug("comp level: %d\n", rec->opts.comp_level);
4242 
4243         err = __cmd_record(&record, argc, argv);
4244 out:
4245         evlist__delete(rec->evlist);
4246         symbol__exit();
4247         auxtrace_record__free(rec->itr);
4248 out_opts:
4249         record__free_thread_masks(rec, rec->nr_threads);
4250         rec->nr_threads = 0;
4251         evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4252         return err;
4253 }
4254 
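     /*
      * Handler for SIGUSR2: take an AUX area tracing snapshot and/or switch
      * the output file when --switch-output=signal is used.  It is installed
      * elsewhere in this file, only when one of those modes is enabled.
      */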
4255 static void snapshot_sig_handler(int sig __maybe_unused)
4256 {
4257         struct record *rec = &record;
4258 
4259         hit_auxtrace_snapshot_trigger(rec);
4260 
4261         if (switch_output_signal(rec))
4262                 trigger_hit(&switch_output_trigger);
4263 }
4264 
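     /*
      * Handler for the SIGALRM armed above for --switch-output=<time>: ask
      * the record loop to rotate the output file when the timer expires.
      */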
4265 static void alarm_sig_handler(int sig __maybe_unused)
4266 {
4267         struct record *rec = &record;
4268 
4269         if (switch_output_time(rec))
4270                 trigger_hit(&switch_output_trigger);
4271 }
4272 
