~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/tools/power/x86/turbostat/turbostat.c

Version: ~ [ linux-6.11.5 ] ~ [ linux-6.10.14 ] ~ [ linux-6.9.12 ] ~ [ linux-6.8.12 ] ~ [ linux-6.7.12 ] ~ [ linux-6.6.58 ] ~ [ linux-6.5.13 ] ~ [ linux-6.4.16 ] ~ [ linux-6.3.13 ] ~ [ linux-6.2.16 ] ~ [ linux-6.1.114 ] ~ [ linux-6.0.19 ] ~ [ linux-5.19.17 ] ~ [ linux-5.18.19 ] ~ [ linux-5.17.15 ] ~ [ linux-5.16.20 ] ~ [ linux-5.15.169 ] ~ [ linux-5.14.21 ] ~ [ linux-5.13.19 ] ~ [ linux-5.12.19 ] ~ [ linux-5.11.22 ] ~ [ linux-5.10.228 ] ~ [ linux-5.9.16 ] ~ [ linux-5.8.18 ] ~ [ linux-5.7.19 ] ~ [ linux-5.6.19 ] ~ [ linux-5.5.19 ] ~ [ linux-5.4.284 ] ~ [ linux-5.3.18 ] ~ [ linux-5.2.21 ] ~ [ linux-5.1.21 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.322 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.336 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.337 ] ~ [ linux-4.4.302 ] ~ [ linux-3.10.108 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.9 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 // SPDX-License-Identifier: GPL-2.0-only
  2 /*
  3  * turbostat -- show CPU frequency and C-state residency
  4  * on modern Intel and AMD processors.
  5  *
  6  * Copyright (c) 2024 Intel Corporation.
  7  * Len Brown <len.brown@intel.com>
  8  */
  9 
 10 #define _GNU_SOURCE
 11 #include MSRHEADER
 12 
// copied from arch/x86/include/asm/cpu_device_id.h
/*
 * VFM: Vendor/Family/Model packed into one int, one byte per field:
 *   bits  7:0  model
 *   bits 15:8  family
 *   bits 23:16 vendor
 *   bits 31:24 reserved
 * VFM_MAKE() packs, VFM_MODEL/FAMILY/VENDOR() unpack.
 */
#define VFM_MODEL_BIT	0
#define VFM_FAMILY_BIT	8
#define VFM_VENDOR_BIT	16
#define VFM_RSVD_BIT	24

#define VFM_MODEL_MASK	GENMASK(VFM_FAMILY_BIT - 1, VFM_MODEL_BIT)
#define VFM_FAMILY_MASK	GENMASK(VFM_VENDOR_BIT - 1, VFM_FAMILY_BIT)
#define VFM_VENDOR_MASK	GENMASK(VFM_RSVD_BIT - 1, VFM_VENDOR_BIT)

#define VFM_MODEL(vfm)	(((vfm) & VFM_MODEL_MASK) >> VFM_MODEL_BIT)
#define VFM_FAMILY(vfm)	(((vfm) & VFM_FAMILY_MASK) >> VFM_FAMILY_BIT)
#define VFM_VENDOR(vfm)	(((vfm) & VFM_VENDOR_MASK) >> VFM_VENDOR_BIT)

#define VFM_MAKE(_vendor, _family, _model) (	\
	((_model) << VFM_MODEL_BIT) |		\
	((_family) << VFM_FAMILY_BIT) |		\
	((_vendor) << VFM_VENDOR_BIT)		\
)
// end copied section

/* Vendor code used with VFM_MAKE() for the Intel platform tables below. */
#define X86_VENDOR_INTEL	0

 36 #include INTEL_FAMILY_HEADER
 37 #include BUILD_BUG_HEADER
 38 #include <stdarg.h>
 39 #include <stdio.h>
 40 #include <err.h>
 41 #include <unistd.h>
 42 #include <sys/types.h>
 43 #include <sys/wait.h>
 44 #include <sys/stat.h>
 45 #include <sys/select.h>
 46 #include <sys/resource.h>
 47 #include <sys/mman.h>
 48 #include <fcntl.h>
 49 #include <signal.h>
 50 #include <sys/time.h>
 51 #include <stdlib.h>
 52 #include <getopt.h>
 53 #include <dirent.h>
 54 #include <string.h>
 55 #include <ctype.h>
 56 #include <sched.h>
 57 #include <time.h>
 58 #include <cpuid.h>
 59 #include <sys/capability.h>
 60 #include <errno.h>
 61 #include <math.h>
 62 #include <linux/perf_event.h>
 63 #include <asm/unistd.h>
 64 #include <stdbool.h>
 65 #include <assert.h>
 66 #include <linux/kernel.h>
 67 
/* Silence "unused parameter" warnings without attributes. */
#define UNUSED(x) (void)(x)

/*
 * This list matches the column headers, except
 * 1. built-in only, the sysfs counters are not here -- we learn of those at run-time
 * 2. Core and CPU are moved to the end, we can't have strings that contain them
 *    matching on them for --show and --hide.
 */

/*
 * buffer size used by sscanf() for added column names
 * Usually truncated to 7 characters, but also handles 18 columns for raw 64-bit counters
 */
#define NAME_BYTES 20
#define PATH_BYTES 128
#define PERF_NAME_BYTES 128

/* RLIMIT_NOFILE ceiling requested at startup (one fd per cpu/counter adds up). */
#define MAX_NOFILE 0x8000

/* "--add perf/<device>/<event>" counter-spec parsing constants. */
#define COUNTER_KIND_PERF_PREFIX "perf/"
#define COUNTER_KIND_PERF_PREFIX_LEN strlen(COUNTER_KIND_PERF_PREFIX)
#define PERF_DEV_NAME_BYTES 32
#define PERF_EVT_NAME_BYTES 32
 91 
/* Topology level a counter is collected at. */
enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE };
/* How a counter's raw value is interpreted (plain count, clock cycles, ...). */
enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC, COUNTER_K2M };
/* How a counter is displayed: raw, delta over interval, percent, or average. */
enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT, FORMAT_AVERAGE };
/* Where a counter's value is read from. */
enum counter_source { COUNTER_SOURCE_NONE, COUNTER_SOURCE_PERF, COUNTER_SOURCE_MSR };
 96 
/*
 * A user-added perf counter ("--add perf/<device>/<event>").
 * Singly-linked list node; one perf fd is opened per domain
 * (cpu/core/package, per 'scope').
 */
struct perf_counter_info {
	struct perf_counter_info *next;

	/* How to open the counter / What counter it is. */
	char device[PERF_DEV_NAME_BYTES];
	char event[PERF_EVT_NAME_BYTES];

	/* How to show/format the counter. */
	char name[PERF_NAME_BYTES];
	unsigned int width;
	enum counter_scope scope;
	enum counter_type type;
	enum counter_format format;
	double scale;

	/* For reading the counter. */
	int *fd_perf_per_domain;	/* array of num_domains perf fds */
	size_t num_domains;
};
116 
/* One sysfs file backing a counter; linked list, 'id' distinguishes instances. */
struct sysfs_path {
	char path[PATH_BYTES];
	int id;
	struct sysfs_path *next;
};
122 
/*
 * A built-in or user-added counter backed by an MSR and/or sysfs path(s).
 * The bic[] table below instantiates one of these per built-in column.
 */
struct msr_counter {
	unsigned int msr_num;
	char name[NAME_BYTES];
	struct sysfs_path *sp;
	unsigned int width;
	enum counter_type type;
	enum counter_format format;
	struct msr_counter *next;
	unsigned int flags;
#define FLAGS_HIDE	(1 << 0)
#define FLAGS_SHOW	(1 << 1)
/*
 * NOTE(review): SYSFS_PERCPU shares bit 1 with FLAGS_SHOW -- presumably
 * intentional because the two are used in disjoint contexts, but worth
 * confirming before adding new flag bits.
 */
#define SYSFS_PERCPU	(1 << 1)
};
136 
/*
 * Built-in column table.  Array index == bit position of the matching
 * BIC_* mask below (e.g. "usec" is index 0 / BIC_USEC, "SMI" is index 9 /
 * BIC_SMI), so entries must not be reordered without updating the masks.
 */
struct msr_counter bic[] = {
	{ 0x0, "usec", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Time_Of_Day_Seconds", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Package", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Node", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Avg_MHz", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Busy%", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Bzy_MHz", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "TSC_MHz", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "IRQ", NULL, 0, 0, 0, NULL, 0 },
	/* SMI is the one entry with explicit width/format (32-bit delta). */
	{ 0x0, "SMI", NULL, 32, 0, FORMAT_DELTA, NULL, 0 },
	{ 0x0, "sysfs", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "CPU%c1", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "CPU%c3", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "CPU%c6", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "CPU%c7", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "ThreadC", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "CoreTmp", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "CoreCnt", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "PkgTmp", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "GFX%rc6", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "GFXMHz", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Pkg%pc2", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Pkg%pc3", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Pkg%pc6", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Pkg%pc7", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Pkg%pc8", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Pkg%pc9", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Pk%pc10", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "CPU%LPI", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "SYS%LPI", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "PkgWatt", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "CorWatt", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "GFXWatt", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "PkgCnt", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "RAMWatt", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "PKG_%", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "RAM_%", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Pkg_J", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Cor_J", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "GFX_J", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "RAM_J", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Mod%c6", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Totl%C0", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Any%C0", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "GFX%C0", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "CPUGFX%", NULL, 0, 0, 0, NULL, 0 },
	/* Core and CPU deliberately last; see comment above the size defines. */
	{ 0x0, "Core", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "CPU", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "APIC", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "X2APIC", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Die", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "GFXAMHz", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "IPC", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "CoreThr", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "UncMHz", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "SAM%mc6", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "SAMMHz", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "SAMAMHz", NULL, 0, 0, 0, NULL, 0 },
	{ 0x0, "Die%c6", NULL, 0, 0, 0, NULL, 0 },
};
198 
#define MAX_BIC (sizeof(bic) / sizeof(struct msr_counter))
/*
 * One bit per built-in column; bit N corresponds to bic[N] above.
 * Used in the 64-bit bic_enabled / bic_present masks below.
 */
#define BIC_USEC	(1ULL << 0)
#define BIC_TOD		(1ULL << 1)
#define BIC_Package	(1ULL << 2)
#define BIC_Node	(1ULL << 3)
#define BIC_Avg_MHz	(1ULL << 4)
#define BIC_Busy	(1ULL << 5)
#define BIC_Bzy_MHz	(1ULL << 6)
#define BIC_TSC_MHz	(1ULL << 7)
#define BIC_IRQ		(1ULL << 8)
#define BIC_SMI		(1ULL << 9)
#define BIC_sysfs	(1ULL << 10)
#define BIC_CPU_c1	(1ULL << 11)
#define BIC_CPU_c3	(1ULL << 12)
#define BIC_CPU_c6	(1ULL << 13)
#define BIC_CPU_c7	(1ULL << 14)
#define BIC_ThreadC	(1ULL << 15)
#define BIC_CoreTmp	(1ULL << 16)
#define BIC_CoreCnt	(1ULL << 17)
#define BIC_PkgTmp	(1ULL << 18)
#define BIC_GFX_rc6	(1ULL << 19)
#define BIC_GFXMHz	(1ULL << 20)
#define BIC_Pkgpc2	(1ULL << 21)
#define BIC_Pkgpc3	(1ULL << 22)
#define BIC_Pkgpc6	(1ULL << 23)
#define BIC_Pkgpc7	(1ULL << 24)
#define BIC_Pkgpc8	(1ULL << 25)
#define BIC_Pkgpc9	(1ULL << 26)
#define BIC_Pkgpc10	(1ULL << 27)
#define BIC_CPU_LPI	(1ULL << 28)
#define BIC_SYS_LPI	(1ULL << 29)
#define BIC_PkgWatt	(1ULL << 30)
#define BIC_CorWatt	(1ULL << 31)
#define BIC_GFXWatt	(1ULL << 32)
#define BIC_PkgCnt	(1ULL << 33)
#define BIC_RAMWatt	(1ULL << 34)
#define BIC_PKG__	(1ULL << 35)
#define BIC_RAM__	(1ULL << 36)
#define BIC_Pkg_J	(1ULL << 37)
#define BIC_Cor_J	(1ULL << 38)
#define BIC_GFX_J	(1ULL << 39)
#define BIC_RAM_J	(1ULL << 40)
#define BIC_Mod_c6	(1ULL << 41)
#define BIC_Totl_c0	(1ULL << 42)
#define BIC_Any_c0	(1ULL << 43)
#define BIC_GFX_c0	(1ULL << 44)
#define BIC_CPUGFX	(1ULL << 45)
#define BIC_Core	(1ULL << 46)
#define BIC_CPU		(1ULL << 47)
#define BIC_APIC	(1ULL << 48)
#define BIC_X2APIC	(1ULL << 49)
#define BIC_Die		(1ULL << 50)
#define BIC_GFXACTMHz	(1ULL << 51)
#define BIC_IPC		(1ULL << 52)
#define BIC_CORE_THROT_CNT	(1ULL << 53)
#define BIC_UNCORE_MHZ		(1ULL << 54)
#define BIC_SAM_mc6		(1ULL << 55)
#define BIC_SAMMHz		(1ULL << 56)
#define BIC_SAMACTMHz		(1ULL << 57)
#define BIC_Diec6		(1ULL << 58)

/* Column groups for --show/--hide category names. */
#define BIC_TOPOLOGY (BIC_Package | BIC_Node | BIC_CoreCnt | BIC_PkgCnt | BIC_Core | BIC_CPU | BIC_Die )
#define BIC_THERMAL_PWR ( BIC_CoreTmp | BIC_PkgTmp | BIC_PkgWatt | BIC_CorWatt | BIC_GFXWatt | BIC_RAMWatt | BIC_PKG__ | BIC_RAM__)
#define BIC_FREQUENCY (BIC_Avg_MHz | BIC_Busy | BIC_Bzy_MHz | BIC_TSC_MHz | BIC_GFXMHz | BIC_GFXACTMHz | BIC_SAMMHz | BIC_SAMACTMHz | BIC_UNCORE_MHZ)
#define BIC_IDLE (BIC_sysfs | BIC_CPU_c1 | BIC_CPU_c3 | BIC_CPU_c6 | BIC_CPU_c7 | BIC_GFX_rc6 | BIC_Pkgpc2 | BIC_Pkgpc3 | BIC_Pkgpc6 | BIC_Pkgpc7 | BIC_Pkgpc8 | BIC_Pkgpc9 | BIC_Pkgpc10 | BIC_CPU_LPI | BIC_SYS_LPI | BIC_Mod_c6 | BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_SAM_mc6 | BIC_Diec6)
#define BIC_OTHER ( BIC_IRQ | BIC_SMI | BIC_ThreadC | BIC_CoreTmp | BIC_IPC)

#define BIC_DISABLED_BY_DEFAULT (BIC_USEC | BIC_TOD | BIC_APIC | BIC_X2APIC)

/* bic_enabled: columns the user wants; bic_present: columns this platform has. */
unsigned long long bic_enabled = (0xFFFFFFFFFFFFFFFFULL & ~BIC_DISABLED_BY_DEFAULT);
unsigned long long bic_present = BIC_USEC | BIC_TOD | BIC_sysfs | BIC_APIC | BIC_X2APIC;

/* A column is shown only when it is both enabled and present. */
#define DO_BIC(COUNTER_NAME) (bic_enabled & bic_present & COUNTER_NAME)
#define DO_BIC_READ(COUNTER_NAME) (bic_present & COUNTER_NAME)
#define ENABLE_BIC(COUNTER_NAME) (bic_enabled |= COUNTER_NAME)
#define BIC_PRESENT(COUNTER_BIT) (bic_present |= COUNTER_BIT)
#define BIC_NOT_PRESENT(COUNTER_BIT) (bic_present &= ~COUNTER_BIT)
#define BIC_IS_ENABLED(COUNTER_BIT) (bic_enabled & COUNTER_BIT)
277 
/*
 * MSR_PKG_CST_CONFIG_CONTROL decoding for pkg_cstate_limit:
 * If you change the values, note they are used both in comparisons
 * (>= PCL__7) and to index pkg_cstate_limit_strings[].
 */
#define PCLUKN 0		/* Unknown */
#define PCLRSV 1		/* Reserved */
#define PCL__0 2		/* PC0 */
#define PCL__1 3		/* PC1 */
#define PCL__2 4		/* PC2 */
#define PCL__3 5		/* PC3 */
#define PCL__4 6		/* PC4 */
#define PCL__6 7		/* PC6 */
#define PCL_6N 8		/* PC6 No Retention */
#define PCL_6R 9		/* PC6 Retention */
#define PCL__7 10		/* PC7 */
#define PCL_7S 11		/* PC7 Shrink */
#define PCL__8 12		/* PC8 */
#define PCL__9 13		/* PC9 */
#define PCL_10 14		/* PC10 */
#define PCLUNL 15		/* Unlimited */
299 
struct amperf_group_fd;

/* --- File-scope state shared across the collection/output paths. --- */

char *proc_stat = "/proc/stat";
FILE *outf;			/* all output goes here (stdout or --out file) */
int *fd_percpu;			/* per-cpu /dev/cpu/N/msr fds */
int *fd_instr_count_percpu;	/* per-cpu perf fds for instruction count (IPC) */
struct timeval interval_tv = { 5, 0 };	/* default 5-second measurement interval */
struct timespec interval_ts = { 5, 0 };

/* Command-line driven behavior flags and counts. */
unsigned int num_iterations;
unsigned int header_iterations;
unsigned int debug;
unsigned int quiet;
unsigned int shown;
unsigned int sums_need_wide_columns;
unsigned int rapl_joules;
unsigned int summary_only;
unsigned int list_header_only;
unsigned int dump_only;

/* CPU capability discovery results. */
unsigned int has_aperf;
unsigned int has_aperf_access;
unsigned int has_epb;
unsigned int has_turbo;
unsigned int is_hybrid;
unsigned int units = 1000000;	/* MHz etc */
unsigned int genuine_intel;
unsigned int authentic_amd;
unsigned int hygon_genuine;
unsigned int max_level, max_extended_level;
unsigned int has_invariant_tsc;
unsigned int aperf_mperf_multiplier = 1;
double bclk;
double base_hz;
unsigned int has_base_hz;
double tsc_tweak = 1.0;
unsigned int show_pkg_only;
unsigned int show_core_only;
char *output_buffer, *outp;	/* formatted output is staged here, then flushed */
unsigned int do_dts;
unsigned int do_ptm;
unsigned int do_ipc;
unsigned long long cpuidle_cur_cpu_lpi_us;
unsigned long long cpuidle_cur_sys_lpi_us;
unsigned int tj_max;
unsigned int tj_max_override;

/* RAPL unit scaling, decoded from MSR_RAPL_POWER_UNIT. */
double rapl_power_units, rapl_time_units;
double rapl_dram_energy_units, rapl_energy_units;
double rapl_joule_counter_range;
unsigned int crystal_hz;
unsigned long long tsc_hz;
int base_cpu;			/* cpu used for per-package / one-off MSR reads */
unsigned int has_hwp;		/* IA32_PM_ENABLE, IA32_HWP_CAPABILITIES */
			/* IA32_HWP_REQUEST, IA32_HWP_STATUS */
unsigned int has_hwp_notify;	/* IA32_HWP_INTERRUPT */
unsigned int has_hwp_activity_window;	/* IA32_HWP_REQUEST[bits 41:32] */
unsigned int has_hwp_epp;	/* IA32_HWP_REQUEST[bits 31:24] */
unsigned int has_hwp_pkg;	/* IA32_HWP_REQUEST_PKG */
unsigned int first_counter_read = 1;
int ignore_stdin;
bool no_msr;			/* --no-msr: never open /dev/cpu/N/msr */
bool no_perf;			/* --no-perf: never use perf counters */
361 
/* Indices into gfx_info[] for graphics/SA-media sysfs counters. */
enum gfx_sysfs_idx {
	GFX_rc6,
	GFX_MHz,
	GFX_ACTMHz,
	SAM_mc6,
	SAM_MHz,
	SAM_ACTMHz,
	GFX_MAX
};

/* One cached sysfs file per gfx counter; kept open in 'fp' between reads. */
struct gfx_sysfs_info {
	const char *path;
	FILE *fp;
	unsigned int val;
	unsigned long long val_ull;
};

static struct gfx_sysfs_info gfx_info[GFX_MAX];
380 
/* Forward declarations (defined later in this file). */
int get_msr(int cpu, off_t offset, unsigned long long *msr);
int add_counter(unsigned int msr_num, char *path, char *name,
		unsigned int width, enum counter_scope scope,
		enum counter_type type, enum counter_format format, int flags, int package_num);
385 
/* Model specific support Start */

/* List of features that may diverge among different platforms */
struct platform_features {
	bool has_msr_misc_feature_control;	/* MSR_MISC_FEATURE_CONTROL */
	bool has_msr_misc_pwr_mgmt;	/* MSR_MISC_PWR_MGMT */
	bool has_nhm_msrs;	/* MSR_PLATFORM_INFO, MSR_IA32_TEMPERATURE_TARGET, MSR_SMI_COUNT, MSR_PKG_CST_CONFIG_CONTROL, MSR_IA32_POWER_CTL, TRL MSRs */
	bool has_config_tdp;	/* MSR_CONFIG_TDP_NOMINAL/LEVEL_1/LEVEL_2/CONTROL, MSR_TURBO_ACTIVATION_RATIO */
	int bclk_freq;		/* CPU base clock */
	int crystal_freq;	/* Crystal clock to use when not available from CPUID.15 */
	int supported_cstates;	/* Core cstates and Package cstates supported */
	int cst_limit;		/* MSR_PKG_CST_CONFIG_CONTROL */
	/* NOTE(review): "convension" is a long-standing misspelling of
	 * "conversion"; the field name is kept as-is for compatibility. */
	bool has_cst_auto_convension;	/* AUTOMATIC_CSTATE_CONVERSION bit in MSR_PKG_CST_CONFIG_CONTROL */
	bool has_irtl_msrs;	/* MSR_PKGC3/PKGC6/PKGC7/PKGC8/PKGC9/PKGC10_IRTL */
	bool has_msr_core_c1_res;	/* MSR_CORE_C1_RES */
	bool has_msr_module_c6_res_ms;	/* MSR_MODULE_C6_RES_MS */
	bool has_msr_c6_demotion_policy_config;	/* MSR_CC6_DEMOTION_POLICY_CONFIG/MSR_MC6_DEMOTION_POLICY_CONFIG */
	bool has_msr_atom_pkg_c6_residency;	/* MSR_ATOM_PKG_C6_RESIDENCY */
	bool has_msr_knl_core_c6_residency;	/* MSR_KNL_CORE_C6_RESIDENCY */
	bool has_ext_cst_msrs;	/* MSR_PKG_WEIGHTED_CORE_C0_RES/MSR_PKG_ANY_CORE_C0_RES/MSR_PKG_ANY_GFXE_C0_RES/MSR_PKG_BOTH_CORE_GFXE_C0_RES */
	bool has_cst_prewake_bit;	/* Cstate prewake bit in MSR_IA32_POWER_CTL */
	int trl_msrs;		/* MSR_TURBO_RATIO_LIMIT/LIMIT1/LIMIT2/SECONDARY, Atom TRL MSRs */
	int plr_msrs;		/* MSR_CORE/GFX/RING_PERF_LIMIT_REASONS */
	int rapl_msrs;		/* RAPL PKG/DRAM/CORE/GFX MSRs, AMD RAPL MSRs */
	bool has_per_core_rapl;	/* Indicates cores energy collection is per-core, not per-package. AMD specific for now */
	bool has_rapl_divisor;	/* Divisor for Energy unit raw value from MSR_RAPL_POWER_UNIT */
	bool has_fixed_rapl_unit;	/* Fixed Energy Unit used for DRAM RAPL Domain */
	int rapl_quirk_tdp;	/* Hardcoded TDP value when cannot be retrieved from hardware */
	int tcc_offset_bits;	/* TCC Offset bits in MSR_IA32_TEMPERATURE_TARGET */
	bool enable_tsc_tweak;	/* Use CPU Base freq instead of TSC freq for aperf/mperf counter */
	bool need_perf_multiplier;	/* mperf/aperf multiplier */
};

/* Maps a packed VFM (see VFM_MAKE) to its platform feature set. */
struct platform_data {
	unsigned int vfm;
	const struct platform_features *features;
};
423 
/* For BCLK */
enum bclk_freq {
	BCLK_100MHZ = 1,
	BCLK_133MHZ,
	BCLK_SLV,	/* Silvermont: read table index from MSR_FSB_FREQ */
};

/* Silvermont BCLK frequencies (MHz), indexed by MSR_FSB_FREQ[3:0]. */
#define SLM_BCLK_FREQS 5
double slm_freq_table[SLM_BCLK_FREQS] = { 83.3, 100.0, 133.3, 116.7, 80.0 };
433 
434 double slm_bclk(void)
435 {
436         unsigned long long msr = 3;
437         unsigned int i;
438         double freq;
439 
440         if (get_msr(base_cpu, MSR_FSB_FREQ, &msr))
441                 fprintf(outf, "SLM BCLK: unknown\n");
442 
443         i = msr & 0xf;
444         if (i >= SLM_BCLK_FREQS) {
445                 fprintf(outf, "SLM BCLK[%d] invalid\n", i);
446                 i = 3;
447         }
448         freq = slm_freq_table[i];
449 
450         if (!quiet)
451                 fprintf(outf, "SLM BCLK: %.1f Mhz\n", freq);
452 
453         return freq;
454 }
455 
/* For Package cstate limit */
enum package_cstate_limit {
	CST_LIMIT_NHM = 1,
	CST_LIMIT_SNB,
	CST_LIMIT_HSW,
	CST_LIMIT_SKX,
	CST_LIMIT_ICX,
	CST_LIMIT_SLV,
	CST_LIMIT_AMT,
	CST_LIMIT_KNL,
	CST_LIMIT_GMT,
};

/* For Turbo Ratio Limit MSRs (bitmask: a platform may have several) */
enum turbo_ratio_limit_msrs {
	TRL_BASE = BIT(0),
	TRL_LIMIT1 = BIT(1),
	TRL_LIMIT2 = BIT(2),
	TRL_ATOM = BIT(3),
	TRL_KNL = BIT(4),
	TRL_CORECOUNT = BIT(5),
};

/* For Perf Limit Reason MSRs (bitmask) */
enum perf_limit_reason_msrs {
	PLR_CORE = BIT(0),
	PLR_GFX = BIT(1),
	PLR_RING = BIT(2),
};
485 
486 /* For RAPL MSRs */
487 enum rapl_msrs {
488         RAPL_PKG_POWER_LIMIT = BIT(0),  /* 0x610 MSR_PKG_POWER_LIMIT */
489         RAPL_PKG_ENERGY_STATUS = BIT(1),        /* 0x611 MSR_PKG_ENERGY_STATUS */
490         RAPL_PKG_PERF_STATUS = BIT(2),  /* 0x613 MSR_PKG_PERF_STATUS */
491         RAPL_PKG_POWER_INFO = BIT(3),   /* 0x614 MSR_PKG_POWER_INFO */
492         RAPL_DRAM_POWER_LIMIT = BIT(4), /* 0x618 MSR_DRAM_POWER_LIMIT */
493         RAPL_DRAM_ENERGY_STATUS = BIT(5),       /* 0x619 MSR_DRAM_ENERGY_STATUS */
494         RAPL_DRAM_PERF_STATUS = BIT(6), /* 0x61b MSR_DRAM_PERF_STATUS */
495         RAPL_DRAM_POWER_INFO = BIT(7),  /* 0x61c MSR_DRAM_POWER_INFO */
496         RAPL_CORE_POWER_LIMIT = BIT(8), /* 0x638 MSR_PP0_POWER_LIMIT */
497         RAPL_CORE_ENERGY_STATUS = BIT(9),       /* 0x639 MSR_PP0_ENERGY_STATUS */
498         RAPL_CORE_POLICY = BIT(10),     /* 0x63a MSR_PP0_POLICY */
499         RAPL_GFX_POWER_LIMIT = BIT(11), /* 0x640 MSR_PP1_POWER_LIMIT */
500         RAPL_GFX_ENERGY_STATUS = BIT(12),       /* 0x641 MSR_PP1_ENERGY_STATUS */
501         RAPL_GFX_POLICY = BIT(13),      /* 0x642 MSR_PP1_POLICY */
502         RAPL_AMD_PWR_UNIT = BIT(14),    /* 0xc0010299 MSR_AMD_RAPL_POWER_UNIT */
503         RAPL_AMD_CORE_ENERGY_STAT = BIT(15),    /* 0xc001029a MSR_AMD_CORE_ENERGY_STATUS */
504         RAPL_AMD_PKG_ENERGY_STAT = BIT(16),     /* 0xc001029b MSR_AMD_PKG_ENERGY_STATUS */
505 };
506 
507 #define RAPL_PKG        (RAPL_PKG_ENERGY_STATUS | RAPL_PKG_POWER_LIMIT)
508 #define RAPL_DRAM       (RAPL_DRAM_ENERGY_STATUS | RAPL_DRAM_POWER_LIMIT)
509 #define RAPL_CORE       (RAPL_CORE_ENERGY_STATUS | RAPL_CORE_POWER_LIMIT)
510 #define RAPL_GFX        (RAPL_GFX_POWER_LIMIT | RAPL_GFX_ENERGY_STATUS)
511 
512 #define RAPL_PKG_ALL    (RAPL_PKG | RAPL_PKG_PERF_STATUS | RAPL_PKG_POWER_INFO)
513 #define RAPL_DRAM_ALL   (RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_DRAM_POWER_INFO)
514 #define RAPL_CORE_ALL   (RAPL_CORE | RAPL_CORE_POLICY)
515 #define RAPL_GFX_ALL    (RAPL_GFX | RAPL_GFX_POLIGY)
516 
517 #define RAPL_AMD_F17H   (RAPL_AMD_PWR_UNIT | RAPL_AMD_CORE_ENERGY_STAT | RAPL_AMD_PKG_ENERGY_STAT)
518 
/* For Cstates: core (CCn) and package (PCn) C-state support bitmask */
enum cstates {
	CC1 = BIT(0),
	CC3 = BIT(1),
	CC6 = BIT(2),
	CC7 = BIT(3),
	PC2 = BIT(4),
	PC3 = BIT(5),
	PC6 = BIT(6),
	PC7 = BIT(7),
	PC8 = BIT(8),
	PC9 = BIT(9),
	PC10 = BIT(10),
};
533 
/* nhm: 133 MHz bclk, NHM cstate limit, base TRL MSR only (Nehalem-class). */
static const struct platform_features nhm_features = {
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.bclk_freq = BCLK_133MHZ,
	.supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6,
	.cst_limit = CST_LIMIT_NHM,
	.trl_msrs = TRL_BASE,
};

/* nhx: like nhm but with no TRL MSRs (Nehalem-EX-class). */
static const struct platform_features nhx_features = {
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.bclk_freq = BCLK_133MHZ,
	.supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6,
	.cst_limit = CST_LIMIT_NHM,
};
550 
/* snb: client RAPL domains (PKG/CORE/GFX); Sandy Bridge-class. */
static const struct platform_features snb_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
	.cst_limit = CST_LIMIT_SNB,
	.has_irtl_msrs = 1,
	.trl_msrs = TRL_BASE,
	.rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO,
};

/* snx: server RAPL domains (PKG/CORE/DRAM); Sandy Bridge server-class. */
static const struct platform_features snx_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
	.cst_limit = CST_LIMIT_SNB,
	.has_irtl_msrs = 1,
	.trl_msrs = TRL_BASE,
	.rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL,
};

/* ivb: snb plus config-TDP MSRs; Ivy Bridge-class. */
static const struct platform_features ivb_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.has_config_tdp = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
	.cst_limit = CST_LIMIT_SNB,
	.has_irtl_msrs = 1,
	.trl_msrs = TRL_BASE,
	.rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO,
};

/* ivx: server variant with TRL_LIMIT1; Ivy Bridge server-class. */
static const struct platform_features ivx_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
	.cst_limit = CST_LIMIT_SNB,
	.has_irtl_msrs = 1,
	.trl_msrs = TRL_BASE | TRL_LIMIT1,
	.rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM_ALL,
};
599 
/* hsw: adds perf-limit-reason MSRs; Haswell client-class. */
static const struct platform_features hsw_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.has_config_tdp = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
	.cst_limit = CST_LIMIT_HSW,
	.has_irtl_msrs = 1,
	.trl_msrs = TRL_BASE,
	.plr_msrs = PLR_CORE | PLR_GFX | PLR_RING,
	.rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO,
};

/* hsx: server variant; fixed DRAM RAPL unit, TRL_LIMIT1/2. */
static const struct platform_features hsx_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.has_config_tdp = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
	.cst_limit = CST_LIMIT_HSW,
	.has_irtl_msrs = 1,
	.trl_msrs = TRL_BASE | TRL_LIMIT1 | TRL_LIMIT2,
	.plr_msrs = PLR_CORE | PLR_RING,
	.rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL,
	.has_fixed_rapl_unit = 1,
};

/* hswl: low-power variant; deeper package cstates (PC8-PC10). */
static const struct platform_features hswl_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.has_config_tdp = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10,
	.cst_limit = CST_LIMIT_HSW,
	.has_irtl_msrs = 1,
	.trl_msrs = TRL_BASE,
	.plr_msrs = PLR_CORE | PLR_GFX | PLR_RING,
	.rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO,
};

/* hswg: graphics variant; same feature set as hsw. */
static const struct platform_features hswg_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.has_config_tdp = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
	.cst_limit = CST_LIMIT_HSW,
	.has_irtl_msrs = 1,
	.trl_msrs = TRL_BASE,
	.plr_msrs = PLR_CORE | PLR_GFX | PLR_RING,
	.rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO,
};
656 
/* bdw: Broadwell client-class; deep package cstates, no PLR MSRs. */
static const struct platform_features bdw_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.has_config_tdp = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10,
	.cst_limit = CST_LIMIT_HSW,
	.has_irtl_msrs = 1,
	.trl_msrs = TRL_BASE,
	.rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO,
};

/* bdwg: graphics variant; shallower package cstates than bdw. */
static const struct platform_features bdwg_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.has_config_tdp = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7,
	.cst_limit = CST_LIMIT_HSW,
	.has_irtl_msrs = 1,
	.trl_msrs = TRL_BASE,
	.rapl_msrs = RAPL_PKG | RAPL_CORE_ALL | RAPL_GFX | RAPL_PKG_POWER_INFO,
};

/* bdx: server variant; auto cstate conversion, fixed DRAM RAPL unit. */
static const struct platform_features bdx_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.has_config_tdp = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC3 | CC6 | PC2 | PC3 | PC6,
	.cst_limit = CST_LIMIT_HSW,
	.has_irtl_msrs = 1,
	.has_cst_auto_convension = 1,
	.trl_msrs = TRL_BASE,
	.rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL,
	.has_fixed_rapl_unit = 1,
};
697 
/* skl: 24 MHz crystal, TCC offset, TSC tweak; Skylake client-class. */
static const struct platform_features skl_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.has_config_tdp = 1,
	.bclk_freq = BCLK_100MHZ,
	.crystal_freq = 24000000,
	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10,
	.cst_limit = CST_LIMIT_HSW,
	.has_irtl_msrs = 1,
	.has_ext_cst_msrs = 1,
	.trl_msrs = TRL_BASE,
	.tcc_offset_bits = 6,
	.rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX,
	.enable_tsc_tweak = 1,
};

/* cnl: adds MSR_CORE_C1_RES, drops CC3; Cannon Lake-class. */
static const struct platform_features cnl_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.has_config_tdp = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10,
	.cst_limit = CST_LIMIT_HSW,
	.has_irtl_msrs = 1,
	.has_msr_core_c1_res = 1,
	.has_ext_cst_msrs = 1,
	.trl_msrs = TRL_BASE,
	.tcc_offset_bits = 6,
	.rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX,
	.enable_tsc_tweak = 1,
};
731 
/* Alder Lake, Raptor Lake and Gracemont (see turbostat_pdata). */
static const struct platform_features adl_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.has_config_tdp = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC8 | PC10,
	.cst_limit = CST_LIMIT_HSW,
	.has_irtl_msrs = 1,
	.has_msr_core_c1_res = 1,
	.has_ext_cst_msrs = 1,
	.trl_msrs = TRL_BASE,
	.tcc_offset_bits = 6,
	.rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX,
	.enable_tsc_tweak = 1,
};
748 
/* Arrow Lake (H/U/base) and Lunar Lake (see turbostat_pdata). */
static const struct platform_features arl_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.has_config_tdp = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC10,
	.cst_limit = CST_LIMIT_HSW,
	.has_irtl_msrs = 1,
	.has_msr_core_c1_res = 1,
	.has_ext_cst_msrs = 1,
	.trl_msrs = TRL_BASE,
	.tcc_offset_bits = 6,
	.rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX,
	.enable_tsc_tweak = 1,
};
765 
/* Skylake-X server (INTEL_SKYLAKE_X). */
static const struct platform_features skx_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.has_config_tdp = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC6 | PC2 | PC6,
	.cst_limit = CST_LIMIT_SKX,
	.has_irtl_msrs = 1,
	.has_cst_auto_convension = 1,
	.trl_msrs = TRL_BASE | TRL_CORECOUNT,
	.rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL,
	.has_fixed_rapl_unit = 1,
};
780 
/* Ice Lake server (INTEL_ICELAKE_X / INTEL_ICELAKE_D). */
static const struct platform_features icx_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.has_config_tdp = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC6 | PC2 | PC6,
	.cst_limit = CST_LIMIT_ICX,
	.has_msr_core_c1_res = 1,
	.has_irtl_msrs = 1,
	.has_cst_prewake_bit = 1,
	.trl_msrs = TRL_BASE | TRL_CORECOUNT,
	.rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL,
	.has_fixed_rapl_unit = 1,
};
796 
/* Sapphire Rapids, Emerald Rapids and Granite Rapids Xeons (see turbostat_pdata). */
static const struct platform_features spr_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.has_config_tdp = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC6 | PC2 | PC6,
	.cst_limit = CST_LIMIT_SKX,
	.has_msr_core_c1_res = 1,
	.has_irtl_msrs = 1,
	.has_cst_prewake_bit = 1,
	.trl_msrs = TRL_BASE | TRL_CORECOUNT,
	.rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL,
};
811 
/* Crestmont-X Atom server (INTEL_ATOM_CRESTMONT_X, i.e. Sierra Forest). */
static const struct platform_features srf_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.has_config_tdp = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC6 | PC2 | PC6,
	.cst_limit = CST_LIMIT_SKX,
	.has_msr_core_c1_res = 1,
	.has_msr_module_c6_res_ms = 1,
	.has_irtl_msrs = 1,
	.has_cst_prewake_bit = 1,
	.trl_msrs = TRL_BASE | TRL_CORECOUNT,
	.rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL,
};
827 
/* Crestmont Atom (INTEL_ATOM_CRESTMONT, i.e. Grand Ridge); no package C-states. */
static const struct platform_features grr_features = {
	.has_msr_misc_feature_control = 1,
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.has_config_tdp = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC6,
	.cst_limit = CST_LIMIT_SKX,
	.has_msr_core_c1_res = 1,
	.has_msr_module_c6_res_ms = 1,
	.has_irtl_msrs = 1,
	.has_cst_prewake_bit = 1,
	.trl_msrs = TRL_BASE | TRL_CORECOUNT,
	.rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL,
};
843 
/* Silvermont Atom client (INTEL_ATOM_SILVERMONT). */
static const struct platform_features slv_features = {
	.has_nhm_msrs = 1,
	.bclk_freq = BCLK_SLV,
	.supported_cstates = CC1 | CC6 | PC6,
	.cst_limit = CST_LIMIT_SLV,
	.has_msr_core_c1_res = 1,
	.has_msr_module_c6_res_ms = 1,
	.has_msr_c6_demotion_policy_config = 1,
	.has_msr_atom_pkg_c6_residency = 1,
	.trl_msrs = TRL_ATOM,
	.rapl_msrs = RAPL_PKG | RAPL_CORE,
	.has_rapl_divisor = 1,
	.rapl_quirk_tdp = 30,
};
858 
/* Silvermont Atom server (INTEL_ATOM_SILVERMONT_D). */
static const struct platform_features slvd_features = {
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.bclk_freq = BCLK_SLV,
	.supported_cstates = CC1 | CC6 | PC3 | PC6,
	.cst_limit = CST_LIMIT_SLV,
	.has_msr_atom_pkg_c6_residency = 1,
	.trl_msrs = TRL_BASE,
	.rapl_msrs = RAPL_PKG | RAPL_CORE,
	.rapl_quirk_tdp = 30,
};
870 
/* Airmont Atom (INTEL_ATOM_AIRMONT). */
static const struct platform_features amt_features = {
	.has_nhm_msrs = 1,
	.bclk_freq = BCLK_133MHZ,
	.supported_cstates = CC1 | CC3 | CC6 | PC3 | PC6,
	.cst_limit = CST_LIMIT_AMT,
	.trl_msrs = TRL_BASE,
};
878 
/* Goldmont Atom client (INTEL_ATOM_GOLDMONT). */
static const struct platform_features gmt_features = {
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.bclk_freq = BCLK_100MHZ,
	.crystal_freq = 19200000,
	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10,
	.cst_limit = CST_LIMIT_GMT,
	.has_irtl_msrs = 1,
	.trl_msrs = TRL_BASE | TRL_CORECOUNT,
	.rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO,
};
890 
/* Goldmont Atom server (INTEL_ATOM_GOLDMONT_D); note the 25 MHz crystal. */
static const struct platform_features gmtd_features = {
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.bclk_freq = BCLK_100MHZ,
	.crystal_freq = 25000000,
	.supported_cstates = CC1 | CC6 | PC2 | PC6,
	.cst_limit = CST_LIMIT_GMT,
	.has_irtl_msrs = 1,
	.has_msr_core_c1_res = 1,
	.trl_msrs = TRL_BASE | TRL_CORECOUNT,
	.rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL | RAPL_CORE_ENERGY_STATUS,
};
903 
/* Goldmont Plus Atom (INTEL_ATOM_GOLDMONT_PLUS). */
static const struct platform_features gmtp_features = {
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.bclk_freq = BCLK_100MHZ,
	.crystal_freq = 19200000,
	.supported_cstates = CC1 | CC3 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10,
	.cst_limit = CST_LIMIT_GMT,
	.has_irtl_msrs = 1,
	.trl_msrs = TRL_BASE,
	.rapl_msrs = RAPL_PKG | RAPL_PKG_POWER_INFO,
};
915 
/* Tremont Atom client (INTEL_ATOM_TREMONT / INTEL_ATOM_TREMONT_L). */
static const struct platform_features tmt_features = {
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC6 | CC7 | PC2 | PC3 | PC6 | PC7 | PC8 | PC9 | PC10,
	.cst_limit = CST_LIMIT_GMT,
	.has_irtl_msrs = 1,
	.trl_msrs = TRL_BASE,
	.rapl_msrs = RAPL_PKG_ALL | RAPL_CORE_ALL | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_GFX,
	.enable_tsc_tweak = 1,
};
927 
/* Tremont Atom server (INTEL_ATOM_TREMONT_D). */
static const struct platform_features tmtd_features = {
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC6,
	.cst_limit = CST_LIMIT_GMT,
	.has_irtl_msrs = 1,
	.trl_msrs = TRL_BASE | TRL_CORECOUNT,
	.rapl_msrs = RAPL_PKG_ALL,
};
938 
/* Xeon Phi Knights Landing / Knights Mill (see turbostat_pdata). */
static const struct platform_features knl_features = {
	.has_msr_misc_pwr_mgmt = 1,
	.has_nhm_msrs = 1,
	.has_config_tdp = 1,
	.bclk_freq = BCLK_100MHZ,
	.supported_cstates = CC1 | CC6 | PC3 | PC6,
	.cst_limit = CST_LIMIT_KNL,
	.has_msr_knl_core_c6_residency = 1,
	.trl_msrs = TRL_KNL,
	.rapl_msrs = RAPL_PKG_ALL | RAPL_DRAM_ALL,
	.has_fixed_rapl_unit = 1,
	.need_perf_multiplier = 1,
};
952 
/* Empty feature set: the fallback used by probe_platform_features() when nothing matches. */
static const struct platform_features default_features = {
};
955 
/* AMD/Hygon Family 17h+ parts that advertise RAPL via CPUID 0x80000007 EDX bit 14. */
static const struct platform_features amd_features_with_rapl = {
	.rapl_msrs = RAPL_AMD_F17H,
	.has_per_core_rapl = 1,
	.rapl_quirk_tdp = 280,	/* This is the max stock TDP of HEDT/Server Fam17h+ chips */
};
961 
/*
 * Map of supported Intel family/model (VFM) values to their
 * platform_features tables. Scanned linearly by
 * probe_platform_features() and terminated by the { 0, NULL } sentinel.
 */
static const struct platform_data turbostat_pdata[] = {
	{ INTEL_NEHALEM, &nhm_features },
	{ INTEL_NEHALEM_G, &nhm_features },
	{ INTEL_NEHALEM_EP, &nhm_features },
	{ INTEL_NEHALEM_EX, &nhx_features },
	{ INTEL_WESTMERE, &nhm_features },
	{ INTEL_WESTMERE_EP, &nhm_features },
	{ INTEL_WESTMERE_EX, &nhx_features },
	{ INTEL_SANDYBRIDGE, &snb_features },
	{ INTEL_SANDYBRIDGE_X, &snx_features },
	{ INTEL_IVYBRIDGE, &ivb_features },
	{ INTEL_IVYBRIDGE_X, &ivx_features },
	{ INTEL_HASWELL, &hsw_features },
	{ INTEL_HASWELL_X, &hsx_features },
	{ INTEL_HASWELL_L, &hswl_features },
	{ INTEL_HASWELL_G, &hswg_features },
	{ INTEL_BROADWELL, &bdw_features },
	{ INTEL_BROADWELL_G, &bdwg_features },
	{ INTEL_BROADWELL_X, &bdx_features },
	{ INTEL_BROADWELL_D, &bdx_features },
	{ INTEL_SKYLAKE_L, &skl_features },
	{ INTEL_SKYLAKE, &skl_features },
	{ INTEL_SKYLAKE_X, &skx_features },
	{ INTEL_KABYLAKE_L, &skl_features },
	{ INTEL_KABYLAKE, &skl_features },
	{ INTEL_COMETLAKE, &skl_features },
	{ INTEL_COMETLAKE_L, &skl_features },
	{ INTEL_CANNONLAKE_L, &cnl_features },
	{ INTEL_ICELAKE_X, &icx_features },
	{ INTEL_ICELAKE_D, &icx_features },
	{ INTEL_ICELAKE_L, &cnl_features },
	{ INTEL_ICELAKE_NNPI, &cnl_features },
	{ INTEL_ROCKETLAKE, &cnl_features },
	{ INTEL_TIGERLAKE_L, &cnl_features },
	{ INTEL_TIGERLAKE, &cnl_features },
	{ INTEL_SAPPHIRERAPIDS_X, &spr_features },
	{ INTEL_EMERALDRAPIDS_X, &spr_features },
	{ INTEL_GRANITERAPIDS_X, &spr_features },
	{ INTEL_LAKEFIELD, &cnl_features },
	{ INTEL_ALDERLAKE, &adl_features },
	{ INTEL_ALDERLAKE_L, &adl_features },
	{ INTEL_RAPTORLAKE, &adl_features },
	{ INTEL_RAPTORLAKE_P, &adl_features },
	{ INTEL_RAPTORLAKE_S, &adl_features },
	{ INTEL_METEORLAKE, &cnl_features },
	{ INTEL_METEORLAKE_L, &cnl_features },
	{ INTEL_ARROWLAKE_H, &arl_features },
	{ INTEL_ARROWLAKE_U, &arl_features },
	{ INTEL_ARROWLAKE, &arl_features },
	{ INTEL_LUNARLAKE_M, &arl_features },
	{ INTEL_ATOM_SILVERMONT, &slv_features },
	{ INTEL_ATOM_SILVERMONT_D, &slvd_features },
	{ INTEL_ATOM_AIRMONT, &amt_features },
	{ INTEL_ATOM_GOLDMONT, &gmt_features },
	{ INTEL_ATOM_GOLDMONT_D, &gmtd_features },
	{ INTEL_ATOM_GOLDMONT_PLUS, &gmtp_features },
	{ INTEL_ATOM_TREMONT_D, &tmtd_features },
	{ INTEL_ATOM_TREMONT, &tmt_features },
	{ INTEL_ATOM_TREMONT_L, &tmt_features },
	{ INTEL_ATOM_GRACEMONT, &adl_features },
	{ INTEL_ATOM_CRESTMONT_X, &srf_features },
	{ INTEL_ATOM_CRESTMONT, &grr_features },
	{ INTEL_XEON_PHI_KNL, &knl_features },
	{ INTEL_XEON_PHI_KNM, &knl_features },
	/*
	 * Missing support for
	 * INTEL_ICELAKE
	 * INTEL_ATOM_SILVERMONT_MID
	 * INTEL_ATOM_AIRMONT_MID
	 * INTEL_ATOM_AIRMONT_NP
	 */
	{ 0, NULL },
};
1035 
/* Feature table for the CPU we are running on; set by probe_platform_features(). */
static const struct platform_features *platform;
1037 
1038 void probe_platform_features(unsigned int family, unsigned int model)
1039 {
1040         int i;
1041 
1042         platform = &default_features;
1043 
1044         if (authentic_amd || hygon_genuine) {
1045                 if (max_extended_level >= 0x80000007) {
1046                         unsigned int eax, ebx, ecx, edx;
1047 
1048                         __cpuid(0x80000007, eax, ebx, ecx, edx);
1049                         /* RAPL (Fam 17h+) */
1050                         if ((edx & (1 << 14)) && family >= 0x17)
1051                                 platform = &amd_features_with_rapl;
1052                 }
1053                 return;
1054         }
1055 
1056         if (!genuine_intel)
1057                 return;
1058 
1059         for (i = 0; turbostat_pdata[i].features; i++) {
1060                 if (VFM_FAMILY(turbostat_pdata[i].vfm) == family && VFM_MODEL(turbostat_pdata[i].vfm) == model) {
1061                         platform = turbostat_pdata[i].features;
1062                         return;
1063                 }
1064         }
1065 }
1066 
1067 /* Model specific support End */
1068 
#define TJMAX_DEFAULT	100	/* degrees C; fallback junction temperature -- NOTE(review): confirm against TJMAX users */

/* MSRs that are not yet in the kernel-provided header. */
#define MSR_RAPL_PWR_UNIT	0xc0010299	/* AMD RAPL power unit */
#define MSR_CORE_ENERGY_STAT	0xc001029a	/* AMD per-core energy status (cf. RAPL_AMD_F17H entries below) */
#define MSR_PKG_ENERGY_STAT	0xc001029b	/* AMD package energy status */

#define MAX(a, b) ((a) > (b) ? (a) : (b))	/* NB: evaluates an argument twice; avoid side effects */

int backwards_count;	/* NOTE(review): presumably counts samples where a counter decreased -- confirm at call sites */
char *progname;

#define CPU_SUBSET_MAXCPUS	1024	/* need to use before probe... */
cpu_set_t *cpu_present_set, *cpu_effective_set, *cpu_allowed_set, *cpu_affinity_set, *cpu_subset;
size_t cpu_present_setsize, cpu_effective_setsize, cpu_allowed_setsize, cpu_affinity_setsize, cpu_subset_size;
/* Upper bounds on added counters at each topology level (see the counter arrays in thread_data etc.). */
#define MAX_ADDED_THREAD_COUNTERS 24
#define MAX_ADDED_CORE_COUNTERS 8
#define MAX_ADDED_PACKAGE_COUNTERS 16
#define PMT_MAX_ADDED_THREAD_COUNTERS 24
#define PMT_MAX_ADDED_CORE_COUNTERS 8
#define PMT_MAX_ADDED_PACKAGE_COUNTERS 16
#define BITMASK_SIZE 32

/* Zero a true array; __must_be_array() makes this fail to compile if arr decays to a pointer. */
#define ZERO_ARRAY(arr) (memset(arr, 0, sizeof(arr)) + __must_be_array(arr))
1093 
/* Indexes used to map data read from perf and MSRs into global variables */
enum rapl_rci_index {
	RAPL_RCI_INDEX_ENERGY_PKG = 0,
	RAPL_RCI_INDEX_ENERGY_CORES = 1,
	RAPL_RCI_INDEX_DRAM = 2,
	RAPL_RCI_INDEX_GFX = 3,
	RAPL_RCI_INDEX_PKG_PERF_STATUS = 4,
	RAPL_RCI_INDEX_DRAM_PERF_STATUS = 5,
	RAPL_RCI_INDEX_CORE_ENERGY = 6,
	NUM_RAPL_COUNTERS,
};

/* Unit in which a RAPL counter value is reported. */
enum rapl_unit {
	RAPL_UNIT_INVALID,
	RAPL_UNIT_JOULES,
	RAPL_UNIT_WATTS,
};

/* Per-domain runtime state for the RAPL counters; all arrays are indexed by enum rapl_rci_index. */
struct rapl_counter_info_t {
	unsigned long long data[NUM_RAPL_COUNTERS];	/* raw counter readings */
	enum counter_source source[NUM_RAPL_COUNTERS];	/* where each counter is read from */
	unsigned long long flags[NUM_RAPL_COUNTERS];	/* RAPL_COUNTER_FLAG_* bits */
	double scale[NUM_RAPL_COUNTERS];		/* factor converting raw data to unit[] */
	enum rapl_unit unit[NUM_RAPL_COUNTERS];
	unsigned long long msr[NUM_RAPL_COUNTERS];	/* MSR address when read via MSR */
	unsigned long long msr_mask[NUM_RAPL_COUNTERS];
	int msr_shift[NUM_RAPL_COUNTERS];	/* positive = shift right, negative = shift left (cf. rapl_counter_arch_info) */

	int fd_perf;	/* fd used when counters are read via perf */
};

/* struct rapl_counter_info_t for each RAPL domain */
struct rapl_counter_info_t *rapl_counter_info_perdomain;
unsigned int rapl_counter_info_perdomain_size;	/* element count of rapl_counter_info_perdomain */

/* NOTE(review): presumably selects MSR-sum accumulation (to survive counter wrap) -- confirm where flags[] is consumed */
#define RAPL_COUNTER_FLAG_USE_MSR_SUM (1u << 1)
1130 
/*
 * Static description of one RAPL counter: the feature bit that gates it,
 * the perf subsystem/event it can be read from, and the fallback MSR with
 * its mask, shift and unit scale. Runtime state lives in struct
 * rapl_counter_info_t.
 */
struct rapl_counter_arch_info {
	int feature_mask;	/* Mask for testing if the counter is supported on host */
	const char *perf_subsys;	/* perf subsystem name; NULL when the counter is MSR-only */
	const char *perf_name;	/* perf event name within perf_subsys */
	unsigned long long msr;	/* MSR address used as the non-perf source */
	unsigned long long msr_mask;	/* mask applied to the raw MSR value */
	int msr_shift;		/* Positive mean shift right, negative mean shift left */
	double *platform_rapl_msr_scale;	/* Scale applied to values read by MSR (platform dependent, filled at runtime) */
	unsigned int rci_index;	/* Maps data from perf counters to global variables */
	unsigned long long bic;	/* BIC_* output-column bits this counter feeds */
	double compat_scale;	/* Some counters require constant scaling to be in the same range as other, similar ones */
	unsigned long long flags;	/* RAPL_COUNTER_FLAG_* */
};
1144 
/*
 * All RAPL counters turbostat knows how to read, with their perf event
 * names and MSR fallbacks. The two RAPL_AMD_F17H entries cover the AMD
 * package and per-core energy counters.
 */
static const struct rapl_counter_arch_info rapl_counter_arch_infos[] = {
	{
	 .feature_mask = RAPL_PKG,
	 .perf_subsys = "power",
	 .perf_name = "energy-pkg",
	 .msr = MSR_PKG_ENERGY_STATUS,
	 .msr_mask = 0xFFFFFFFFFFFFFFFF,
	 .msr_shift = 0,
	 .platform_rapl_msr_scale = &rapl_energy_units,
	 .rci_index = RAPL_RCI_INDEX_ENERGY_PKG,
	 .bic = BIC_PkgWatt | BIC_Pkg_J,
	 .compat_scale = 1.0,
	 .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
	  },
	{
	 .feature_mask = RAPL_AMD_F17H,
	 .perf_subsys = "power",
	 .perf_name = "energy-pkg",
	 .msr = MSR_PKG_ENERGY_STAT,
	 .msr_mask = 0xFFFFFFFFFFFFFFFF,
	 .msr_shift = 0,
	 .platform_rapl_msr_scale = &rapl_energy_units,
	 .rci_index = RAPL_RCI_INDEX_ENERGY_PKG,
	 .bic = BIC_PkgWatt | BIC_Pkg_J,
	 .compat_scale = 1.0,
	 .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
	  },
	{
	 .feature_mask = RAPL_CORE_ENERGY_STATUS,
	 .perf_subsys = "power",
	 .perf_name = "energy-cores",
	 .msr = MSR_PP0_ENERGY_STATUS,
	 .msr_mask = 0xFFFFFFFFFFFFFFFF,
	 .msr_shift = 0,
	 .platform_rapl_msr_scale = &rapl_energy_units,
	 .rci_index = RAPL_RCI_INDEX_ENERGY_CORES,
	 .bic = BIC_CorWatt | BIC_Cor_J,
	 .compat_scale = 1.0,
	 .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
	  },
	{
	 .feature_mask = RAPL_DRAM,
	 .perf_subsys = "power",
	 .perf_name = "energy-ram",
	 .msr = MSR_DRAM_ENERGY_STATUS,
	 .msr_mask = 0xFFFFFFFFFFFFFFFF,
	 .msr_shift = 0,
	 .platform_rapl_msr_scale = &rapl_dram_energy_units,
	 .rci_index = RAPL_RCI_INDEX_DRAM,
	 .bic = BIC_RAMWatt | BIC_RAM_J,
	 .compat_scale = 1.0,
	 .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
	  },
	{
	 .feature_mask = RAPL_GFX,
	 .perf_subsys = "power",
	 .perf_name = "energy-gpu",
	 .msr = MSR_PP1_ENERGY_STATUS,
	 .msr_mask = 0xFFFFFFFFFFFFFFFF,
	 .msr_shift = 0,
	 .platform_rapl_msr_scale = &rapl_energy_units,
	 .rci_index = RAPL_RCI_INDEX_GFX,
	 .bic = BIC_GFXWatt | BIC_GFX_J,
	 .compat_scale = 1.0,
	 .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
	  },
	{
	 .feature_mask = RAPL_PKG_PERF_STATUS,
	 .perf_subsys = NULL,
	 .perf_name = NULL,
	 .msr = MSR_PKG_PERF_STATUS,
	 .msr_mask = 0xFFFFFFFFFFFFFFFF,
	 .msr_shift = 0,
	 .platform_rapl_msr_scale = &rapl_time_units,
	 .rci_index = RAPL_RCI_INDEX_PKG_PERF_STATUS,
	 .bic = BIC_PKG__,
	 .compat_scale = 100.0,
	 .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
	  },
	{
	 .feature_mask = RAPL_DRAM_PERF_STATUS,
	 .perf_subsys = NULL,
	 .perf_name = NULL,
	 .msr = MSR_DRAM_PERF_STATUS,
	 .msr_mask = 0xFFFFFFFFFFFFFFFF,
	 .msr_shift = 0,
	 .platform_rapl_msr_scale = &rapl_time_units,
	 .rci_index = RAPL_RCI_INDEX_DRAM_PERF_STATUS,
	 .bic = BIC_RAM__,
	 .compat_scale = 100.0,
	 .flags = RAPL_COUNTER_FLAG_USE_MSR_SUM,
	  },
	{
	 .feature_mask = RAPL_AMD_F17H,
	 .perf_subsys = NULL,
	 .perf_name = NULL,
	 .msr = MSR_CORE_ENERGY_STAT,
	 .msr_mask = 0xFFFFFFFF,
	 .msr_shift = 0,
	 .platform_rapl_msr_scale = &rapl_energy_units,
	 .rci_index = RAPL_RCI_INDEX_CORE_ENERGY,
	 .bic = BIC_CorWatt | BIC_Cor_J,
	 .compat_scale = 1.0,
	 .flags = 0,
	  },
};
1251 
/* A single RAPL reading: the raw counter value plus how to interpret it. */
struct rapl_counter {
	unsigned long long raw_value;
	enum rapl_unit unit;
	double scale;	/* factor converting raw_value into unit */
};

/* Indexes used to map data read from perf and MSRs into global variables */
/* CCSTATE_* index core C-state residencies; PCSTATE_* index package C-state residencies. */
enum ccstate_rci_index {
	CCSTATE_RCI_INDEX_C1_RESIDENCY = 0,
	CCSTATE_RCI_INDEX_C3_RESIDENCY = 1,
	CCSTATE_RCI_INDEX_C6_RESIDENCY = 2,
	CCSTATE_RCI_INDEX_C7_RESIDENCY = 3,
	PCSTATE_RCI_INDEX_C2_RESIDENCY = 4,
	PCSTATE_RCI_INDEX_C3_RESIDENCY = 5,
	PCSTATE_RCI_INDEX_C6_RESIDENCY = 6,
	PCSTATE_RCI_INDEX_C7_RESIDENCY = 7,
	PCSTATE_RCI_INDEX_C8_RESIDENCY = 8,
	PCSTATE_RCI_INDEX_C9_RESIDENCY = 9,
	PCSTATE_RCI_INDEX_C10_RESIDENCY = 10,
	NUM_CSTATE_COUNTERS,
};

/* Runtime state of the C-state counters; arrays indexed by enum ccstate_rci_index. */
struct cstate_counter_info_t {
	unsigned long long data[NUM_CSTATE_COUNTERS];
	enum counter_source source[NUM_CSTATE_COUNTERS];
	unsigned long long msr[NUM_CSTATE_COUNTERS];
	int fd_perf_core;	/* fd for the "cstate_core" perf subsystem */
	int fd_perf_pkg;	/* fd for the "cstate_pkg" perf subsystem */
};

struct cstate_counter_info_t *ccstate_counter_info;
unsigned int ccstate_counter_info_size;

#define CSTATE_COUNTER_FLAG_COLLECT_PER_CORE   (1u << 0)
/* Per-thread collection; includes the per-core bit, so per-thread implies per-core. */
#define CSTATE_COUNTER_FLAG_COLLECT_PER_THREAD ((1u << 1) | CSTATE_COUNTER_FLAG_COLLECT_PER_CORE)
#define CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY (1u << 2)

/*
 * Static description of one C-state residency counter: gating feature bit,
 * perf subsystem/event, fallback MSR, and the package C-state limit that
 * applies (PCL_* for package counters, 0 for core counters in the table below).
 */
struct cstate_counter_arch_info {
	int feature_mask;	/* Mask for testing if the counter is supported on host */
	const char *perf_subsys;
	const char *perf_name;
	unsigned long long msr;
	unsigned int rci_index;	/* Maps data from perf counters to global variables */
	unsigned long long bic;
	unsigned long long flags;
	int pkg_cstate_limit;
};
1299 
/*
 * All core ("cstate_core") and package ("cstate_pkg") C-state residency
 * counters turbostat knows how to read, with their perf event names and
 * MSR fallbacks.
 */
static struct cstate_counter_arch_info ccstate_counter_arch_infos[] = {
	{
	 .feature_mask = CC1,
	 .perf_subsys = "cstate_core",
	 .perf_name = "c1-residency",
	 .msr = MSR_CORE_C1_RES,
	 .rci_index = CCSTATE_RCI_INDEX_C1_RESIDENCY,
	 .bic = BIC_CPU_c1,
	 .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_THREAD,
	 .pkg_cstate_limit = 0,
	  },
	{
	 .feature_mask = CC3,
	 .perf_subsys = "cstate_core",
	 .perf_name = "c3-residency",
	 .msr = MSR_CORE_C3_RESIDENCY,
	 .rci_index = CCSTATE_RCI_INDEX_C3_RESIDENCY,
	 .bic = BIC_CPU_c3,
	 .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_CORE | CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY,
	 .pkg_cstate_limit = 0,
	  },
	{
	 .feature_mask = CC6,
	 .perf_subsys = "cstate_core",
	 .perf_name = "c6-residency",
	 .msr = MSR_CORE_C6_RESIDENCY,
	 .rci_index = CCSTATE_RCI_INDEX_C6_RESIDENCY,
	 .bic = BIC_CPU_c6,
	 .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_CORE | CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY,
	 .pkg_cstate_limit = 0,
	  },
	{
	 .feature_mask = CC7,
	 .perf_subsys = "cstate_core",
	 .perf_name = "c7-residency",
	 .msr = MSR_CORE_C7_RESIDENCY,
	 .rci_index = CCSTATE_RCI_INDEX_C7_RESIDENCY,
	 .bic = BIC_CPU_c7,
	 .flags = CSTATE_COUNTER_FLAG_COLLECT_PER_CORE | CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY,
	 .pkg_cstate_limit = 0,
	  },
	{
	 .feature_mask = PC2,
	 .perf_subsys = "cstate_pkg",
	 .perf_name = "c2-residency",
	 .msr = MSR_PKG_C2_RESIDENCY,
	 .rci_index = PCSTATE_RCI_INDEX_C2_RESIDENCY,
	 .bic = BIC_Pkgpc2,
	 .flags = 0,
	 .pkg_cstate_limit = PCL__2,
	  },
	{
	 .feature_mask = PC3,
	 .perf_subsys = "cstate_pkg",
	 .perf_name = "c3-residency",
	 .msr = MSR_PKG_C3_RESIDENCY,
	 .rci_index = PCSTATE_RCI_INDEX_C3_RESIDENCY,
	 .bic = BIC_Pkgpc3,
	 .flags = 0,
	 .pkg_cstate_limit = PCL__3,
	  },
	{
	 .feature_mask = PC6,
	 .perf_subsys = "cstate_pkg",
	 .perf_name = "c6-residency",
	 .msr = MSR_PKG_C6_RESIDENCY,
	 .rci_index = PCSTATE_RCI_INDEX_C6_RESIDENCY,
	 .bic = BIC_Pkgpc6,
	 .flags = 0,
	 .pkg_cstate_limit = PCL__6,
	  },
	{
	 .feature_mask = PC7,
	 .perf_subsys = "cstate_pkg",
	 .perf_name = "c7-residency",
	 .msr = MSR_PKG_C7_RESIDENCY,
	 .rci_index = PCSTATE_RCI_INDEX_C7_RESIDENCY,
	 .bic = BIC_Pkgpc7,
	 .flags = 0,
	 .pkg_cstate_limit = PCL__7,
	  },
	{
	 .feature_mask = PC8,
	 .perf_subsys = "cstate_pkg",
	 .perf_name = "c8-residency",
	 .msr = MSR_PKG_C8_RESIDENCY,
	 .rci_index = PCSTATE_RCI_INDEX_C8_RESIDENCY,
	 .bic = BIC_Pkgpc8,
	 .flags = 0,
	 .pkg_cstate_limit = PCL__8,
	  },
	{
	 .feature_mask = PC9,
	 .perf_subsys = "cstate_pkg",
	 .perf_name = "c9-residency",
	 .msr = MSR_PKG_C9_RESIDENCY,
	 .rci_index = PCSTATE_RCI_INDEX_C9_RESIDENCY,
	 .bic = BIC_Pkgpc9,
	 .flags = 0,
	 .pkg_cstate_limit = PCL__9,
	  },
	{
	 .feature_mask = PC10,
	 .perf_subsys = "cstate_pkg",
	 .perf_name = "c10-residency",
	 .msr = MSR_PKG_C10_RESIDENCY,
	 .rci_index = PCSTATE_RCI_INDEX_C10_RESIDENCY,
	 .bic = BIC_Pkgpc10,
	 .flags = 0,
	 .pkg_cstate_limit = PCL_10,
	  },
};
1412 
/* Indexes used to map data read from perf and MSRs into global variables */
enum msr_rci_index {
	MSR_RCI_INDEX_APERF = 0,
	MSR_RCI_INDEX_MPERF = 1,
	MSR_RCI_INDEX_SMI = 2,
	NUM_MSR_COUNTERS,
};

/* Runtime state of the APERF/MPERF/SMI counters; arrays indexed by enum msr_rci_index. */
struct msr_counter_info_t {
	unsigned long long data[NUM_MSR_COUNTERS];
	enum counter_source source[NUM_MSR_COUNTERS];
	unsigned long long msr[NUM_MSR_COUNTERS];
	unsigned long long msr_mask[NUM_MSR_COUNTERS];
	int fd_perf;	/* fd used when the counters are read via perf */
};

struct msr_counter_info_t *msr_counter_info;
unsigned int msr_counter_info_size;

/* Static description of one of the APERF/MPERF/SMI counters. */
struct msr_counter_arch_info {
	const char *perf_subsys;
	const char *perf_name;
	unsigned long long msr;	/* MSR address used as the non-perf source */
	unsigned long long msr_mask;
	unsigned int rci_index;	/* Maps data from perf counters to global variables */
	bool needed;	/* NOTE(review): presumably set when an enabled column requires this counter -- confirm at call sites */
	bool present;	/* NOTE(review): presumably set when probing finds the counter readable -- confirm at call sites */
};

/* Fixed positions of the entries in msr_counter_arch_infos[] below. */
enum msr_arch_info_index {
	MSR_ARCH_INFO_APERF_INDEX = 0,
	MSR_ARCH_INFO_MPERF_INDEX = 1,
	MSR_ARCH_INFO_SMI_INDEX = 2,
};
1447 
/*
 * APERF, MPERF and SMI-count sources, each readable via the "msr" perf
 * subsystem or directly from its MSR. Note the SMI count is only 32 bits
 * wide (msr_mask).
 */
static struct msr_counter_arch_info msr_counter_arch_infos[] = {
	[MSR_ARCH_INFO_APERF_INDEX] = {
				       .perf_subsys = "msr",
				       .perf_name = "aperf",
				       .msr = MSR_IA32_APERF,
				       .msr_mask = 0xFFFFFFFFFFFFFFFF,
				       .rci_index = MSR_RCI_INDEX_APERF,
				        },

	[MSR_ARCH_INFO_MPERF_INDEX] = {
				       .perf_subsys = "msr",
				       .perf_name = "mperf",
				       .msr = MSR_IA32_MPERF,
				       .msr_mask = 0xFFFFFFFFFFFFFFFF,
				       .rci_index = MSR_RCI_INDEX_MPERF,
				        },

	[MSR_ARCH_INFO_SMI_INDEX] = {
				     .perf_subsys = "msr",
				     .perf_name = "smi",
				     .msr = MSR_SMI_COUNT,
				     .msr_mask = 0xFFFFFFFF,
				     .rci_index = MSR_RCI_INDEX_SMI,
				      },
};
1473 
/* Can be redefined when compiling, useful for testing. */
#ifndef SYSFS_TELEM_PATH
#define SYSFS_TELEM_PATH "/sys/class/intel_pmt"
#endif

/*
 * Location of the Meteor Lake DC6 residency counter within its PMT
 * telemetry region: counter offset, inclusive bit range, and the region
 * GUID to match. NOTE(review): values assumed from the PMT telemetry
 * layout -- confirm against the intel_pmt documentation.
 */
#define PMT_COUNTER_MTL_DC6_OFFSET 120
#define PMT_COUNTER_MTL_DC6_LSB    0
#define PMT_COUNTER_MTL_DC6_MSB    63
#define PMT_MTL_DC6_GUID           0x1a067102

/* Sizes of the fixed name buffers used for PMT counters. */
#define PMT_COUNTER_NAME_SIZE_BYTES      16
#define PMT_COUNTER_TYPE_NAME_SIZE_BYTES 32
1486 
/* One mmaped PMT telemetry region, identified by its GUID. */
struct pmt_mmio {
	struct pmt_mmio *next;

	unsigned int guid;
	unsigned int size;

	/* Base pointer to the mmaped memory. */
	void *mmio_base;

	/*
	 * Offset to be applied to the mmio_base
	 * to get the beginning of the PMT counters for given GUID.
	 */
	unsigned long pmt_offset;
} *pmt_mmios;			/* head of the list of mapped PMT regions */

/* How a PMT counter's raw value is to be interpreted. */
enum pmt_datatype {
	PMT_TYPE_RAW,
	PMT_TYPE_XTAL_TIME,
};

struct pmt_domain_info {
	/*
	 * Pointer to the MMIO obtained by applying a counter offset
	 * to the mmio_base of the mmaped region for the given GUID.
	 *
	 * This is where to read the raw value of the counter from.
	 */
	unsigned long *pcounter;
};

/* A user-visible PMT counter, with one pmt_domain_info per domain it appears in. */
struct pmt_counter {
	struct pmt_counter *next;

	/* PMT metadata */
	char name[PMT_COUNTER_NAME_SIZE_BYTES];
	enum pmt_datatype type;
	enum counter_scope scope;
	unsigned int lsb;	/* inclusive bit range of the value; see pmt_counter_get_width() */
	unsigned int msb;

	/* BIC-like metadata */
	enum counter_format format;

	unsigned int num_domains;	/* elements in domains[]; grown by pmt_counter_resize() */
	struct pmt_domain_info *domains;
};
1534 
1535 unsigned int pmt_counter_get_width(const struct pmt_counter *p)
1536 {
1537         return (p->msb - p->lsb) + 1;
1538 }
1539 
1540 void pmt_counter_resize_(struct pmt_counter *pcounter, unsigned int new_size)
1541 {
1542         struct pmt_domain_info *new_mem;
1543 
1544         new_mem = (struct pmt_domain_info *)reallocarray(pcounter->domains, new_size, sizeof(*pcounter->domains));
1545         if (!new_mem) {
1546                 fprintf(stderr, "%s: failed to allocate memory for PMT counters\n", __func__);
1547                 exit(1);
1548         }
1549 
1550         /* Zero initialize just allocated memory. */
1551         const size_t num_new_domains = new_size - pcounter->num_domains;
1552 
1553         memset(&new_mem[pcounter->num_domains], 0, num_new_domains * sizeof(*pcounter->domains));
1554 
1555         pcounter->num_domains = new_size;
1556         pcounter->domains = new_mem;
1557 }
1558 
1559 void pmt_counter_resize(struct pmt_counter *pcounter, unsigned int new_size)
1560 {
1561         /*
1562          * Allocate more memory ahead of time.
1563          *
1564          * Always allocate space for at least 8 elements
1565          * and double the size when growing.
1566          */
1567         if (new_size < 8)
1568                 new_size = 8;
1569         new_size = MAX(new_size, pcounter->num_domains * 2);
1570 
1571         pmt_counter_resize_(pcounter, new_size);
1572 }
1573 
/*
 * Per hyper-thread counter snapshot; double-buffered globally as
 * thread_even / thread_odd.
 */
struct thread_data {
        struct timeval tv_begin;        /* sample interval start */
        struct timeval tv_end;          /* sample interval end */
        struct timeval tv_delta;        /* tv_end - tv_begin */
        unsigned long long tsc;         /* time stamp counter */
        unsigned long long aperf;
        unsigned long long mperf;
        unsigned long long c1;
        unsigned long long instr_count; /* retired instructions (perf) */
        unsigned long long irq_count;   /* from /proc/interrupts */
        unsigned int smi_count;
        unsigned int cpu_id;            /* Linux logical CPU number */
        unsigned int apic_id;
        unsigned int x2apic_id;
        unsigned int flags;
        bool is_atom;
        unsigned long long counter[MAX_ADDED_THREAD_COUNTERS];  /* --add MSR counters */
        unsigned long long perf_counter[MAX_ADDED_THREAD_COUNTERS];     /* --add perf counters */
        unsigned long long pmt_counter[PMT_MAX_ADDED_THREAD_COUNTERS];  /* --add PMT counters */
} *thread_even, *thread_odd;
1594 
/*
 * Per physical core counter snapshot; double-buffered globally as
 * core_even / core_odd.
 */
struct core_data {
        int base_cpu;                   /* first CPU seen in this core; -1 if none */
        unsigned long long c3;
        unsigned long long c6;
        unsigned long long c7;
        unsigned long long mc6_us;      /* duplicate as per-core for now, even though per module */
        unsigned int core_temp_c;       /* core temperature, degrees Celsius */
        struct rapl_counter core_energy;        /* MSR_CORE_ENERGY_STAT */
        unsigned int core_id;
        unsigned long long core_throt_cnt;      /* core throttle count */
        unsigned long long counter[MAX_ADDED_CORE_COUNTERS];    /* --add MSR counters */
        unsigned long long perf_counter[MAX_ADDED_CORE_COUNTERS];       /* --add perf counters */
        unsigned long long pmt_counter[PMT_MAX_ADDED_CORE_COUNTERS];    /* --add PMT counters */
} *core_even, *core_odd;
1609 
/*
 * Per package counter snapshot; double-buffered globally as
 * package_even / package_odd.
 */
struct pkg_data {
        int base_cpu;                   /* first CPU seen in this package; -1 if none */
        unsigned long long pc2;         /* package C-state residencies */
        unsigned long long pc3;
        unsigned long long pc6;
        unsigned long long pc7;
        unsigned long long pc8;
        unsigned long long pc9;
        unsigned long long pc10;
        long long cpu_lpi;              /* CPU low-power-idle residency */
        long long sys_lpi;              /* system low-power-idle residency */
        unsigned long long pkg_wtd_core_c0;
        unsigned long long pkg_any_core_c0;
        unsigned long long pkg_any_gfxe_c0;
        unsigned long long pkg_both_core_gfxe_c0;
        long long gfx_rc6_ms;           /* graphics RC6 residency, ms */
        unsigned int gfx_mhz;
        unsigned int gfx_act_mhz;
        long long sam_mc6_ms;
        unsigned int sam_mhz;
        unsigned int sam_act_mhz;
        unsigned int package_id;
        struct rapl_counter energy_pkg; /* MSR_PKG_ENERGY_STATUS */
        struct rapl_counter energy_dram;        /* MSR_DRAM_ENERGY_STATUS */
        struct rapl_counter energy_cores;       /* MSR_PP0_ENERGY_STATUS */
        struct rapl_counter energy_gfx; /* MSR_PP1_ENERGY_STATUS */
        struct rapl_counter rapl_pkg_perf_status;       /* MSR_PKG_PERF_STATUS */
        struct rapl_counter rapl_dram_perf_status;      /* MSR_DRAM_PERF_STATUS */
        unsigned int pkg_temp_c;        /* package temperature, degrees Celsius */
        unsigned int uncore_mhz;
        unsigned long long die_c6;
        unsigned long long counter[MAX_ADDED_PACKAGE_COUNTERS]; /* --add MSR counters */
        unsigned long long perf_counter[MAX_ADDED_PACKAGE_COUNTERS];    /* --add perf counters */
        unsigned long long pmt_counter[PMT_MAX_ADDED_PACKAGE_COUNTERS]; /* --add PMT counters */
} *package_even, *package_odd;
1645 
#define ODD_COUNTERS thread_odd, core_odd, package_odd
#define EVEN_COUNTERS thread_even, core_even, package_even

/*
 * Index into the flat thread/core/package arrays by topology position.
 * The arrays are laid out package-major: all entries of a package are
 * contiguous, then nodes within it, then cores, then threads.
 */
#define GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no)          \
        ((thread_base) +                                                      \
         ((pkg_no) *                                                          \
          topo.nodes_per_pkg * topo.cores_per_node * topo.threads_per_core) + \
         ((node_no) * topo.cores_per_node * topo.threads_per_core) +          \
         ((core_no) * topo.threads_per_core) +                                \
         (thread_no))

#define GET_CORE(core_base, core_no, node_no, pkg_no)                   \
        ((core_base) +                                                  \
         ((pkg_no) *  topo.nodes_per_pkg * topo.cores_per_node) +       \
         ((node_no) * topo.cores_per_node) +                            \
         (core_no))

/*
 * Parenthesize the macro arguments (like GET_THREAD/GET_CORE above do)
 * so low-precedence argument expressions expand correctly.
 */
#define GET_PKG(pkg_base, pkg_no) ((pkg_base) + (pkg_no))
1664 
/*
 * The accumulated sum of MSR is defined as a monotonic
 * increasing MSR, it will be accumulated periodically,
 * despite its register's bit width.
 */
enum {
        IDX_PKG_ENERGY,         /* package energy status */
        IDX_DRAM_ENERGY,        /* DRAM energy status */
        IDX_PP0_ENERGY,         /* power-plane 0 (cores) energy status */
        IDX_PP1_ENERGY,         /* power-plane 1 (graphics) energy status */
        IDX_PKG_PERF,           /* package RAPL perf (throttling) status */
        IDX_DRAM_PERF,          /* DRAM RAPL perf (throttling) status */
        IDX_COUNT,              /* number of accumulated MSRs */
};
1679 
1680 int get_msr_sum(int cpu, off_t offset, unsigned long long *msr);
1681 
/* Per-CPU accumulator that widens narrow, wrapping MSRs into 64-bit sums. */
struct msr_sum_array {
        /* get_msr_sum() = sum + (get_msr() - last) */
        struct {
                /*The accumulated MSR value is updated by the timer */
                unsigned long long sum;
                /*The MSR footprint recorded in last timer */
                unsigned long long last;
        } entries[IDX_COUNT];
};

/* The percpu MSR sum array.*/
struct msr_sum_array *per_cpu_msr_sum;
1694 
1695 off_t idx_to_offset(int idx)
1696 {
1697         off_t offset;
1698 
1699         switch (idx) {
1700         case IDX_PKG_ENERGY:
1701                 if (platform->rapl_msrs & RAPL_AMD_F17H)
1702                         offset = MSR_PKG_ENERGY_STAT;
1703                 else
1704                         offset = MSR_PKG_ENERGY_STATUS;
1705                 break;
1706         case IDX_DRAM_ENERGY:
1707                 offset = MSR_DRAM_ENERGY_STATUS;
1708                 break;
1709         case IDX_PP0_ENERGY:
1710                 offset = MSR_PP0_ENERGY_STATUS;
1711                 break;
1712         case IDX_PP1_ENERGY:
1713                 offset = MSR_PP1_ENERGY_STATUS;
1714                 break;
1715         case IDX_PKG_PERF:
1716                 offset = MSR_PKG_PERF_STATUS;
1717                 break;
1718         case IDX_DRAM_PERF:
1719                 offset = MSR_DRAM_PERF_STATUS;
1720                 break;
1721         default:
1722                 offset = -1;
1723         }
1724         return offset;
1725 }
1726 
1727 int offset_to_idx(off_t offset)
1728 {
1729         int idx;
1730 
1731         switch (offset) {
1732         case MSR_PKG_ENERGY_STATUS:
1733         case MSR_PKG_ENERGY_STAT:
1734                 idx = IDX_PKG_ENERGY;
1735                 break;
1736         case MSR_DRAM_ENERGY_STATUS:
1737                 idx = IDX_DRAM_ENERGY;
1738                 break;
1739         case MSR_PP0_ENERGY_STATUS:
1740                 idx = IDX_PP0_ENERGY;
1741                 break;
1742         case MSR_PP1_ENERGY_STATUS:
1743                 idx = IDX_PP1_ENERGY;
1744                 break;
1745         case MSR_PKG_PERF_STATUS:
1746                 idx = IDX_PKG_PERF;
1747                 break;
1748         case MSR_DRAM_PERF_STATUS:
1749                 idx = IDX_DRAM_PERF;
1750                 break;
1751         default:
1752                 idx = -1;
1753         }
1754         return idx;
1755 }
1756 
1757 int idx_valid(int idx)
1758 {
1759         switch (idx) {
1760         case IDX_PKG_ENERGY:
1761                 return platform->rapl_msrs & (RAPL_PKG | RAPL_AMD_F17H);
1762         case IDX_DRAM_ENERGY:
1763                 return platform->rapl_msrs & RAPL_DRAM;
1764         case IDX_PP0_ENERGY:
1765                 return platform->rapl_msrs & RAPL_CORE_ENERGY_STATUS;
1766         case IDX_PP1_ENERGY:
1767                 return platform->rapl_msrs & RAPL_GFX;
1768         case IDX_PKG_PERF:
1769                 return platform->rapl_msrs & RAPL_PKG_PERF_STATUS;
1770         case IDX_DRAM_PERF:
1771                 return platform->rapl_msrs & RAPL_DRAM_PERF_STATUS;
1772         default:
1773                 return 0;
1774         }
1775 }
1776 
/* Global registry of all user-added counters, grouped by access method. */
struct sys_counters {
        /* MSR added counters */
        unsigned int added_thread_counters;
        unsigned int added_core_counters;
        unsigned int added_package_counters;
        struct msr_counter *tp;         /* thread-scope list */
        struct msr_counter *cp;         /* core-scope list */
        struct msr_counter *pp;         /* package-scope list */

        /* perf added counters */
        unsigned int added_thread_perf_counters;
        unsigned int added_core_perf_counters;
        unsigned int added_package_perf_counters;
        struct perf_counter_info *perf_tp;
        struct perf_counter_info *perf_cp;
        struct perf_counter_info *perf_pp;

        /* PMT added counters, same thread/core/package split */
        struct pmt_counter *pmt_tp;
        struct pmt_counter *pmt_cp;
        struct pmt_counter *pmt_pp;
} sys;
1798 
1799 static size_t free_msr_counters_(struct msr_counter **pp)
1800 {
1801         struct msr_counter *p = NULL;
1802         size_t num_freed = 0;
1803 
1804         while (*pp) {
1805                 p = *pp;
1806 
1807                 if (p->msr_num != 0) {
1808                         *pp = p->next;
1809 
1810                         free(p);
1811                         ++num_freed;
1812 
1813                         continue;
1814                 }
1815 
1816                 pp = &p->next;
1817         }
1818 
1819         return num_freed;
1820 }
1821 
1822 /*
1823  * Free all added counters accessed via msr.
1824  */
1825 static void free_sys_msr_counters(void)
1826 {
1827         /* Thread counters */
1828         sys.added_thread_counters -= free_msr_counters_(&sys.tp);
1829 
1830         /* Core counters */
1831         sys.added_core_counters -= free_msr_counters_(&sys.cp);
1832 
1833         /* Package counters */
1834         sys.added_package_counters -= free_msr_counters_(&sys.pp);
1835 }
1836 
/* System-wide averages printed on the summary ("-S") line. */
struct system_summary {
        struct thread_data threads;
        struct core_data cores;
        struct pkg_data packages;
} average;
1842 
/* Topology record for one logical CPU; the global cpus[] is indexed by CPU number. */
struct cpu_topology {
        int physical_package_id;
        int die_id;
        int logical_cpu_id;
        int physical_node_id;
        int logical_node_id;    /* 0-based count within the package */
        int physical_core_id;
        int thread_id;
        cpu_set_t *put_ids;     /* Processing Unit/Thread IDs */
} *cpus;
1853 
/* Summary of the machine's topology, filled in during startup probing. */
struct topo_params {
        int num_packages;
        int num_die;
        int num_cpus;
        int num_cores;
        int allowed_packages;   /* "allowed" = within the --cpu / affinity set */
        int allowed_cpus;
        int allowed_cores;
        int max_cpu_num;
        int max_core_id;
        int max_package_id;
        int max_die_id;
        int max_node_num;
        int nodes_per_pkg;      /* these three drive the GET_* array indexing macros */
        int cores_per_node;
        int threads_per_core;
} topo;
1871 
/* Timestamps for the even/odd sample buffers and their difference. */
struct timeval tv_even, tv_odd, tv_delta;

int *irq_column_2_cpu;          /* /proc/interrupts column numbers */
int *irqs_per_cpu;              /* indexed by cpu_num */

void setup_all_buffers(bool startup);

/* Low-power-idle residency file; presumably set to one of the two paths below — confirm at assignment site. */
char *sys_lpi_file;
char *sys_lpi_file_sysfs = "/sys/devices/system/cpu/cpuidle/low_power_idle_system_residency_us";
char *sys_lpi_file_debugfs = "/sys/kernel/debug/pmc_core/slp_s0_residency_usec";
1882 
1883 int cpu_is_not_present(int cpu)
1884 {
1885         return !CPU_ISSET_S(cpu, cpu_present_setsize, cpu_present_set);
1886 }
1887 
1888 int cpu_is_not_allowed(int cpu)
1889 {
1890         return !CPU_ISSET_S(cpu, cpu_allowed_setsize, cpu_allowed_set);
1891 }
1892 
1893 /*
1894  * run func(thread, core, package) in topology order
1895  * skip non-present cpus
1896  */
1897 
1898 int for_all_cpus(int (func) (struct thread_data *, struct core_data *, struct pkg_data *),
1899                  struct thread_data *thread_base, struct core_data *core_base, struct pkg_data *pkg_base)
1900 {
1901         int retval, pkg_no, core_no, thread_no, node_no;
1902 
1903         for (pkg_no = 0; pkg_no < topo.num_packages; ++pkg_no) {
1904                 for (node_no = 0; node_no < topo.nodes_per_pkg; node_no++) {
1905                         for (core_no = 0; core_no < topo.cores_per_node; ++core_no) {
1906                                 for (thread_no = 0; thread_no < topo.threads_per_core; ++thread_no) {
1907                                         struct thread_data *t;
1908                                         struct core_data *c;
1909                                         struct pkg_data *p;
1910                                         t = GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no);
1911 
1912                                         if (cpu_is_not_allowed(t->cpu_id))
1913                                                 continue;
1914 
1915                                         c = GET_CORE(core_base, core_no, node_no, pkg_no);
1916                                         p = GET_PKG(pkg_base, pkg_no);
1917 
1918                                         retval = func(t, c, p);
1919                                         if (retval)
1920                                                 return retval;
1921                                 }
1922                         }
1923                 }
1924         }
1925         return 0;
1926 }
1927 
1928 int is_cpu_first_thread_in_core(struct thread_data *t, struct core_data *c, struct pkg_data *p)
1929 {
1930         UNUSED(p);
1931 
1932         return ((int)t->cpu_id == c->base_cpu || c->base_cpu < 0);
1933 }
1934 
1935 int is_cpu_first_core_in_package(struct thread_data *t, struct core_data *c, struct pkg_data *p)
1936 {
1937         UNUSED(c);
1938 
1939         return ((int)t->cpu_id == p->base_cpu || p->base_cpu < 0);
1940 }
1941 
/* True when @t is both the first thread of its core and the first core of its package. */
int is_cpu_first_thread_in_package(struct thread_data *t, struct core_data *c, struct pkg_data *p)
{
        if (!is_cpu_first_thread_in_core(t, c, p))
                return 0;

        return is_cpu_first_core_in_package(t, c, p);
}
1946 
1947 int cpu_migrate(int cpu)
1948 {
1949         CPU_ZERO_S(cpu_affinity_setsize, cpu_affinity_set);
1950         CPU_SET_S(cpu, cpu_affinity_setsize, cpu_affinity_set);
1951         if (sched_setaffinity(0, cpu_affinity_setsize, cpu_affinity_set) == -1)
1952                 return -1;
1953         else
1954                 return 0;
1955 }
1956 
1957 int get_msr_fd(int cpu)
1958 {
1959         char pathname[32];
1960         int fd;
1961 
1962         fd = fd_percpu[cpu];
1963 
1964         if (fd)
1965                 return fd;
1966 
1967         sprintf(pathname, "/dev/cpu/%d/msr", cpu);
1968         fd = open(pathname, O_RDONLY);
1969         if (fd < 0)
1970                 err(-1, "%s open failed, try chown or chmod +r /dev/cpu/*/msr, "
1971                     "or run with --no-msr, or run as root", pathname);
1972 
1973         fd_percpu[cpu] = fd;
1974 
1975         return fd;
1976 }
1977 
1978 static void bic_disable_msr_access(void)
1979 {
1980         const unsigned long bic_msrs = BIC_Mod_c6 | BIC_CoreTmp |
1981             BIC_Totl_c0 | BIC_Any_c0 | BIC_GFX_c0 | BIC_CPUGFX | BIC_PkgTmp;
1982 
1983         bic_enabled &= ~bic_msrs;
1984 
1985         free_sys_msr_counters();
1986 }
1987 
1988 static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags)
1989 {
1990         assert(!no_perf);
1991 
1992         return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
1993 }
1994 
1995 static long open_perf_counter(int cpu, unsigned int type, unsigned int config, int group_fd, __u64 read_format)
1996 {
1997         struct perf_event_attr attr;
1998         const pid_t pid = -1;
1999         const unsigned long flags = 0;
2000 
2001         assert(!no_perf);
2002 
2003         memset(&attr, 0, sizeof(struct perf_event_attr));
2004 
2005         attr.type = type;
2006         attr.size = sizeof(struct perf_event_attr);
2007         attr.config = config;
2008         attr.disabled = 0;
2009         attr.sample_type = PERF_SAMPLE_IDENTIFIER;
2010         attr.read_format = read_format;
2011 
2012         const int fd = perf_event_open(&attr, pid, cpu, group_fd, flags);
2013 
2014         return fd;
2015 }
2016 
2017 int get_instr_count_fd(int cpu)
2018 {
2019         if (fd_instr_count_percpu[cpu])
2020                 return fd_instr_count_percpu[cpu];
2021 
2022         fd_instr_count_percpu[cpu] = open_perf_counter(cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, -1, 0);
2023 
2024         return fd_instr_count_percpu[cpu];
2025 }
2026 
2027 int get_msr(int cpu, off_t offset, unsigned long long *msr)
2028 {
2029         ssize_t retval;
2030 
2031         assert(!no_msr);
2032 
2033         retval = pread(get_msr_fd(cpu), msr, sizeof(*msr), offset);
2034 
2035         if (retval != sizeof *msr)
2036                 err(-1, "cpu%d: msr offset 0x%llx read failed", cpu, (unsigned long long)offset);
2037 
2038         return 0;
2039 }
2040 
2041 int probe_msr(int cpu, off_t offset)
2042 {
2043         ssize_t retval;
2044         unsigned long long dummy;
2045 
2046         assert(!no_msr);
2047 
2048         retval = pread(get_msr_fd(cpu), &dummy, sizeof(dummy), offset);
2049 
2050         if (retval != sizeof(dummy))
2051                 return 1;
2052 
2053         return 0;
2054 }
2055 
2056 /* Convert CPU ID to domain ID for given added perf counter. */
2057 unsigned int cpu_to_domain(const struct perf_counter_info *pc, int cpu)
2058 {
2059         switch (pc->scope) {
2060         case SCOPE_CPU:
2061                 return cpu;
2062 
2063         case SCOPE_CORE:
2064                 return cpus[cpu].physical_core_id;
2065 
2066         case SCOPE_PACKAGE:
2067                 return cpus[cpu].physical_package_id;
2068         }
2069 
2070         __builtin_unreachable();
2071 }
2072 
/* Column names not recognized at parse time; resolved later (see bic_lookup()). */
#define MAX_DEFERRED 16
char *deferred_add_names[MAX_DEFERRED];         /* unrecognized names from a SHOW_LIST lookup */
char *deferred_skip_names[MAX_DEFERRED];        /* unrecognized names from a HIDE_LIST lookup */
int deferred_add_index;
int deferred_skip_index;

/*
 * HIDE_LIST - hide this list of counters, show the rest [default]
 * SHOW_LIST - show this list of counters, hide the rest
 */
enum show_hide_mode { SHOW_LIST, HIDE_LIST } global_show_hide_mode = HIDE_LIST;
2084 
/* Print the command-line usage message to the output stream. */
void help(void)
{
        fprintf(outf,
                "Usage: turbostat [OPTIONS][(--interval seconds) | COMMAND ...]\n"
                "\n"
                "Turbostat forks the specified COMMAND and prints statistics\n"
                "when COMMAND completes.\n"
                "If no COMMAND is specified, turbostat wakes every 5-seconds\n"
                "to print statistics, until interrupted.\n"
                "  -a, --add    add a counter\n"
                "                 eg. --add msr0x10,u64,cpu,delta,MY_TSC\n"
                "                 eg. --add perf/cstate_pkg/c2-residency,package,delta,percent,perfPC2\n"
                "                 eg. --add pmt,name=XTAL,type=raw,domain=package0,offset=0,lsb=0,msb=63,guid=0x1a067102\n"
                "  -c, --cpu    cpu-set limit output to summary plus cpu-set:\n"
                "                 {core | package | j,k,l..m,n-p }\n"
                "  -d, --debug  displays usec, Time_Of_Day_Seconds and more debugging\n"
                "               debug messages are printed to stderr\n"
                "  -D, --Dump   displays the raw counter values\n"
                "  -e, --enable [all | column]\n"
                "               shows all or the specified disabled column\n"
                "  -H, --hide [column|column,column,...]\n"
                "               hide the specified column(s)\n"
                "  -i, --interval sec.subsec\n"
                "               Override default 5-second measurement interval\n"
                "  -J, --Joules displays energy in Joules instead of Watts\n"
                "  -l, --list   list column headers only\n"
                "  -M, --no-msr Disable all uses of the MSR driver\n"
                "  -P, --no-perf Disable all uses of the perf API\n"
                "  -n, --num_iterations num\n"
                "               number of the measurement iterations\n"
                "  -N, --header_iterations num\n"
                "               print header every num iterations\n"
                "  -o, --out file\n"
                "               create or truncate \"file\" for all output\n"
                "  -q, --quiet  skip decoding system configuration header\n"
                "  -s, --show [column|column,column,...]\n"
                "               show only the specified column(s)\n"
                "  -S, --Summary\n"
                "               limits output to 1-line system summary per interval\n"
                "  -T, --TCC temperature\n"
                "               sets the Thermal Control Circuit temperature in\n"
                "                 degrees Celsius\n"
                "  -h, --help   print this help message\n"
                "  -v, --version        print version information\n" "\n" "For more help, run \"man turbostat\"\n");
}
2130 
/*
 * bic_lookup
 * for all the strings in comma separate name_list,
 * set the appropriate bit in return value.
 *
 * Names that match no built-in column (and none of the group aliases)
 * are remembered in deferred_add_names / deferred_skip_names, to be
 * matched later against dynamically discovered counters.
 */
unsigned long long bic_lookup(char *name_list, enum show_hide_mode mode)
{
        unsigned int i;
        unsigned long long retval = 0;

        while (name_list) {
                char *comma;

                /* split off the first comma-separated token, in place */
                comma = strchr(name_list, ',');

                if (comma)
                        *comma = '\0';

                for (i = 0; i < MAX_BIC; ++i) {
                        if (!strcmp(name_list, bic[i].name)) {
                                retval |= (1ULL << i);
                                break;
                        }
                        /*
                         * Group aliases; checked inside the loop but they
                         * break out on the first iteration when matched.
                         */
                        if (!strcmp(name_list, "all")) {
                                retval |= ~0;   /* all ones: enable everything */
                                break;
                        } else if (!strcmp(name_list, "topology")) {
                                retval |= BIC_TOPOLOGY;
                                break;
                        } else if (!strcmp(name_list, "power")) {
                                retval |= BIC_THERMAL_PWR;
                                break;
                        } else if (!strcmp(name_list, "idle")) {
                                retval |= BIC_IDLE;
                                break;
                        } else if (!strcmp(name_list, "frequency")) {
                                retval |= BIC_FREQUENCY;
                                break;
                        } else if (!strcmp(name_list, "other")) {
                                retval |= BIC_OTHER;
                                break;
                        }

                }
                /* i == MAX_BIC: nothing matched; defer this name for later */
                if (i == MAX_BIC) {
                        if (mode == SHOW_LIST) {
                                deferred_add_names[deferred_add_index++] = name_list;
                                if (deferred_add_index >= MAX_DEFERRED) {
                                        fprintf(stderr, "More than max %d un-recognized --add options '%s'\n",
                                                MAX_DEFERRED, name_list);
                                        help();
                                        exit(1);
                                }
                        } else {
                                deferred_skip_names[deferred_skip_index++] = name_list;
                                if (debug)
                                        fprintf(stderr, "deferred \"%s\"\n", name_list);
                                if (deferred_skip_index >= MAX_DEFERRED) {
                                        fprintf(stderr, "More than max %d un-recognized --skip options '%s'\n",
                                                MAX_DEFERRED, name_list);
                                        help();
                                        exit(1);
                                }
                        }
                }

                /* advance past the comma we NUL-terminated above */
                name_list = comma;
                if (name_list)
                        name_list++;

        }
        return retval;
}
2204 
2205 void print_header(char *delim)
2206 {
2207         struct msr_counter *mp;
2208         struct perf_counter_info *pp;
2209         struct pmt_counter *ppmt;
2210         int printed = 0;
2211 
2212         if (DO_BIC(BIC_USEC))
2213                 outp += sprintf(outp, "%susec", (printed++ ? delim : ""));
2214         if (DO_BIC(BIC_TOD))
2215                 outp += sprintf(outp, "%sTime_Of_Day_Seconds", (printed++ ? delim : ""));
2216         if (DO_BIC(BIC_Package))
2217                 outp += sprintf(outp, "%sPackage", (printed++ ? delim : ""));
2218         if (DO_BIC(BIC_Die))
2219                 outp += sprintf(outp, "%sDie", (printed++ ? delim : ""));
2220         if (DO_BIC(BIC_Node))
2221                 outp += sprintf(outp, "%sNode", (printed++ ? delim : ""));
2222         if (DO_BIC(BIC_Core))
2223                 outp += sprintf(outp, "%sCore", (printed++ ? delim : ""));
2224         if (DO_BIC(BIC_CPU))
2225                 outp += sprintf(outp, "%sCPU", (printed++ ? delim : ""));
2226         if (DO_BIC(BIC_APIC))
2227                 outp += sprintf(outp, "%sAPIC", (printed++ ? delim : ""));
2228         if (DO_BIC(BIC_X2APIC))
2229                 outp += sprintf(outp, "%sX2APIC", (printed++ ? delim : ""));
2230         if (DO_BIC(BIC_Avg_MHz))
2231                 outp += sprintf(outp, "%sAvg_MHz", (printed++ ? delim : ""));
2232         if (DO_BIC(BIC_Busy))
2233                 outp += sprintf(outp, "%sBusy%%", (printed++ ? delim : ""));
2234         if (DO_BIC(BIC_Bzy_MHz))
2235                 outp += sprintf(outp, "%sBzy_MHz", (printed++ ? delim : ""));
2236         if (DO_BIC(BIC_TSC_MHz))
2237                 outp += sprintf(outp, "%sTSC_MHz", (printed++ ? delim : ""));
2238 
2239         if (DO_BIC(BIC_IPC))
2240                 outp += sprintf(outp, "%sIPC", (printed++ ? delim : ""));
2241 
2242         if (DO_BIC(BIC_IRQ)) {
2243                 if (sums_need_wide_columns)
2244                         outp += sprintf(outp, "%s     IRQ", (printed++ ? delim : ""));
2245                 else
2246                         outp += sprintf(outp, "%sIRQ", (printed++ ? delim : ""));
2247         }
2248 
2249         if (DO_BIC(BIC_SMI))
2250                 outp += sprintf(outp, "%sSMI", (printed++ ? delim : ""));
2251 
2252         for (mp = sys.tp; mp; mp = mp->next) {
2253 
2254                 if (mp->format == FORMAT_RAW) {
2255                         if (mp->width == 64)
2256                                 outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), mp->name);
2257                         else
2258                                 outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), mp->name);
2259                 } else {
2260                         if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns)
2261                                 outp += sprintf(outp, "%s%8s", (printed++ ? delim : ""), mp->name);
2262                         else
2263                                 outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), mp->name);
2264                 }
2265         }
2266 
2267         for (pp = sys.perf_tp; pp; pp = pp->next) {
2268 
2269                 if (pp->format == FORMAT_RAW) {
2270                         if (pp->width == 64)
2271                                 outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), pp->name);
2272                         else
2273                                 outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), pp->name);
2274                 } else {
2275                         if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns)
2276                                 outp += sprintf(outp, "%s%8s", (printed++ ? delim : ""), pp->name);
2277                         else
2278                                 outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), pp->name);
2279                 }
2280         }
2281 
2282         ppmt = sys.pmt_tp;
2283         while (ppmt) {
2284                 switch (ppmt->type) {
2285                 case PMT_TYPE_RAW:
2286                         if (pmt_counter_get_width(ppmt) <= 32)
2287                                 outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), ppmt->name);
2288                         else
2289                                 outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), ppmt->name);
2290 
2291                         break;
2292 
2293                 case PMT_TYPE_XTAL_TIME:
2294                         outp += sprintf(outp, "%s%s", delim, ppmt->name);
2295                         break;
2296                 }
2297 
2298                 ppmt = ppmt->next;
2299         }
2300 
2301         if (DO_BIC(BIC_CPU_c1))
2302                 outp += sprintf(outp, "%sCPU%%c1", (printed++ ? delim : ""));
2303         if (DO_BIC(BIC_CPU_c3))
2304                 outp += sprintf(outp, "%sCPU%%c3", (printed++ ? delim : ""));
2305         if (DO_BIC(BIC_CPU_c6))
2306                 outp += sprintf(outp, "%sCPU%%c6", (printed++ ? delim : ""));
2307         if (DO_BIC(BIC_CPU_c7))
2308                 outp += sprintf(outp, "%sCPU%%c7", (printed++ ? delim : ""));
2309 
2310         if (DO_BIC(BIC_Mod_c6))
2311                 outp += sprintf(outp, "%sMod%%c6", (printed++ ? delim : ""));
2312 
2313         if (DO_BIC(BIC_CoreTmp))
2314                 outp += sprintf(outp, "%sCoreTmp", (printed++ ? delim : ""));
2315 
2316         if (DO_BIC(BIC_CORE_THROT_CNT))
2317                 outp += sprintf(outp, "%sCoreThr", (printed++ ? delim : ""));
2318 
2319         if (platform->rapl_msrs && !rapl_joules) {
2320                 if (DO_BIC(BIC_CorWatt) && platform->has_per_core_rapl)
2321                         outp += sprintf(outp, "%sCorWatt", (printed++ ? delim : ""));
2322         } else if (platform->rapl_msrs && rapl_joules) {
2323                 if (DO_BIC(BIC_Cor_J) && platform->has_per_core_rapl)
2324                         outp += sprintf(outp, "%sCor_J", (printed++ ? delim : ""));
2325         }
2326 
2327         for (mp = sys.cp; mp; mp = mp->next) {
2328                 if (mp->format == FORMAT_RAW) {
2329                         if (mp->width == 64)
2330                                 outp += sprintf(outp, "%s%18.18s", delim, mp->name);
2331                         else
2332                                 outp += sprintf(outp, "%s%10.10s", delim, mp->name);
2333                 } else {
2334                         if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns)
2335                                 outp += sprintf(outp, "%s%8s", delim, mp->name);
2336                         else
2337                                 outp += sprintf(outp, "%s%s", delim, mp->name);
2338                 }
2339         }
2340 
2341         for (pp = sys.perf_cp; pp; pp = pp->next) {
2342 
2343                 if (pp->format == FORMAT_RAW) {
2344                         if (pp->width == 64)
2345                                 outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), pp->name);
2346                         else
2347                                 outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), pp->name);
2348                 } else {
2349                         if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns)
2350                                 outp += sprintf(outp, "%s%8s", (printed++ ? delim : ""), pp->name);
2351                         else
2352                                 outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), pp->name);
2353                 }
2354         }
2355 
2356         ppmt = sys.pmt_cp;
2357         while (ppmt) {
2358                 switch (ppmt->type) {
2359                 case PMT_TYPE_RAW:
2360                         if (pmt_counter_get_width(ppmt) <= 32)
2361                                 outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), ppmt->name);
2362                         else
2363                                 outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), ppmt->name);
2364 
2365                         break;
2366 
2367                 case PMT_TYPE_XTAL_TIME:
2368                         outp += sprintf(outp, "%s%s", delim, ppmt->name);
2369                         break;
2370                 }
2371 
2372                 ppmt = ppmt->next;
2373         }
2374 
2375         if (DO_BIC(BIC_PkgTmp))
2376                 outp += sprintf(outp, "%sPkgTmp", (printed++ ? delim : ""));
2377 
2378         if (DO_BIC(BIC_GFX_rc6))
2379                 outp += sprintf(outp, "%sGFX%%rc6", (printed++ ? delim : ""));
2380 
2381         if (DO_BIC(BIC_GFXMHz))
2382                 outp += sprintf(outp, "%sGFXMHz", (printed++ ? delim : ""));
2383 
2384         if (DO_BIC(BIC_GFXACTMHz))
2385                 outp += sprintf(outp, "%sGFXAMHz", (printed++ ? delim : ""));
2386 
2387         if (DO_BIC(BIC_SAM_mc6))
2388                 outp += sprintf(outp, "%sSAM%%mc6", (printed++ ? delim : ""));
2389 
2390         if (DO_BIC(BIC_SAMMHz))
2391                 outp += sprintf(outp, "%sSAMMHz", (printed++ ? delim : ""));
2392 
2393         if (DO_BIC(BIC_SAMACTMHz))
2394                 outp += sprintf(outp, "%sSAMAMHz", (printed++ ? delim : ""));
2395 
2396         if (DO_BIC(BIC_Totl_c0))
2397                 outp += sprintf(outp, "%sTotl%%C0", (printed++ ? delim : ""));
2398         if (DO_BIC(BIC_Any_c0))
2399                 outp += sprintf(outp, "%sAny%%C0", (printed++ ? delim : ""));
2400         if (DO_BIC(BIC_GFX_c0))
2401                 outp += sprintf(outp, "%sGFX%%C0", (printed++ ? delim : ""));
2402         if (DO_BIC(BIC_CPUGFX))
2403                 outp += sprintf(outp, "%sCPUGFX%%", (printed++ ? delim : ""));
2404 
2405         if (DO_BIC(BIC_Pkgpc2))
2406                 outp += sprintf(outp, "%sPkg%%pc2", (printed++ ? delim : ""));
2407         if (DO_BIC(BIC_Pkgpc3))
2408                 outp += sprintf(outp, "%sPkg%%pc3", (printed++ ? delim : ""));
2409         if (DO_BIC(BIC_Pkgpc6))
2410                 outp += sprintf(outp, "%sPkg%%pc6", (printed++ ? delim : ""));
2411         if (DO_BIC(BIC_Pkgpc7))
2412                 outp += sprintf(outp, "%sPkg%%pc7", (printed++ ? delim : ""));
2413         if (DO_BIC(BIC_Pkgpc8))
2414                 outp += sprintf(outp, "%sPkg%%pc8", (printed++ ? delim : ""));
2415         if (DO_BIC(BIC_Pkgpc9))
2416                 outp += sprintf(outp, "%sPkg%%pc9", (printed++ ? delim : ""));
2417         if (DO_BIC(BIC_Pkgpc10))
2418                 outp += sprintf(outp, "%sPk%%pc10", (printed++ ? delim : ""));
2419         if (DO_BIC(BIC_Diec6))
2420                 outp += sprintf(outp, "%sDie%%c6", (printed++ ? delim : ""));
2421         if (DO_BIC(BIC_CPU_LPI))
2422                 outp += sprintf(outp, "%sCPU%%LPI", (printed++ ? delim : ""));
2423         if (DO_BIC(BIC_SYS_LPI))
2424                 outp += sprintf(outp, "%sSYS%%LPI", (printed++ ? delim : ""));
2425 
2426         if (platform->rapl_msrs && !rapl_joules) {
2427                 if (DO_BIC(BIC_PkgWatt))
2428                         outp += sprintf(outp, "%sPkgWatt", (printed++ ? delim : ""));
2429                 if (DO_BIC(BIC_CorWatt) && !platform->has_per_core_rapl)
2430                         outp += sprintf(outp, "%sCorWatt", (printed++ ? delim : ""));
2431                 if (DO_BIC(BIC_GFXWatt))
2432                         outp += sprintf(outp, "%sGFXWatt", (printed++ ? delim : ""));
2433                 if (DO_BIC(BIC_RAMWatt))
2434                         outp += sprintf(outp, "%sRAMWatt", (printed++ ? delim : ""));
2435                 if (DO_BIC(BIC_PKG__))
2436                         outp += sprintf(outp, "%sPKG_%%", (printed++ ? delim : ""));
2437                 if (DO_BIC(BIC_RAM__))
2438                         outp += sprintf(outp, "%sRAM_%%", (printed++ ? delim : ""));
2439         } else if (platform->rapl_msrs && rapl_joules) {
2440                 if (DO_BIC(BIC_Pkg_J))
2441                         outp += sprintf(outp, "%sPkg_J", (printed++ ? delim : ""));
2442                 if (DO_BIC(BIC_Cor_J) && !platform->has_per_core_rapl)
2443                         outp += sprintf(outp, "%sCor_J", (printed++ ? delim : ""));
2444                 if (DO_BIC(BIC_GFX_J))
2445                         outp += sprintf(outp, "%sGFX_J", (printed++ ? delim : ""));
2446                 if (DO_BIC(BIC_RAM_J))
2447                         outp += sprintf(outp, "%sRAM_J", (printed++ ? delim : ""));
2448                 if (DO_BIC(BIC_PKG__))
2449                         outp += sprintf(outp, "%sPKG_%%", (printed++ ? delim : ""));
2450                 if (DO_BIC(BIC_RAM__))
2451                         outp += sprintf(outp, "%sRAM_%%", (printed++ ? delim : ""));
2452         }
2453         if (DO_BIC(BIC_UNCORE_MHZ))
2454                 outp += sprintf(outp, "%sUncMHz", (printed++ ? delim : ""));
2455 
2456         for (mp = sys.pp; mp; mp = mp->next) {
2457                 if (mp->format == FORMAT_RAW) {
2458                         if (mp->width == 64)
2459                                 outp += sprintf(outp, "%s%18.18s", delim, mp->name);
2460                         else if (mp->width == 32)
2461                                 outp += sprintf(outp, "%s%10.10s", delim, mp->name);
2462                         else
2463                                 outp += sprintf(outp, "%s%7.7s", delim, mp->name);
2464                 } else {
2465                         if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns)
2466                                 outp += sprintf(outp, "%s%8s", delim, mp->name);
2467                         else
2468                                 outp += sprintf(outp, "%s%7.7s", delim, mp->name);
2469                 }
2470         }
2471 
2472         for (pp = sys.perf_pp; pp; pp = pp->next) {
2473 
2474                 if (pp->format == FORMAT_RAW) {
2475                         if (pp->width == 64)
2476                                 outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), pp->name);
2477                         else
2478                                 outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), pp->name);
2479                 } else {
2480                         if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns)
2481                                 outp += sprintf(outp, "%s%8s", (printed++ ? delim : ""), pp->name);
2482                         else
2483                                 outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), pp->name);
2484                 }
2485         }
2486 
2487         ppmt = sys.pmt_pp;
2488         while (ppmt) {
2489                 switch (ppmt->type) {
2490                 case PMT_TYPE_RAW:
2491                         if (pmt_counter_get_width(ppmt) <= 32)
2492                                 outp += sprintf(outp, "%s%10.10s", (printed++ ? delim : ""), ppmt->name);
2493                         else
2494                                 outp += sprintf(outp, "%s%18.18s", (printed++ ? delim : ""), ppmt->name);
2495 
2496                         break;
2497 
2498                 case PMT_TYPE_XTAL_TIME:
2499                         outp += sprintf(outp, "%s%s", delim, ppmt->name);
2500                         break;
2501                 }
2502 
2503                 ppmt = ppmt->next;
2504         }
2505 
2506         outp += sprintf(outp, "\n");
2507 }
2508 
/*
 * Dump the raw, unprocessed counter snapshots for one CPU into the global
 * output buffer "outp" (debug path).  Core-scope data is printed only for
 * the first thread in each core, and package-scope data only for the first
 * core in each package, so each shared hardware counter appears once.
 *
 * Always returns 0; the signature matches the per-CPU callback convention
 * used elsewhere in this file (thread/core/package triple).
 */
int dump_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
{
	int i;
	struct msr_counter *mp;

	outp += sprintf(outp, "t %p, c %p, p %p\n", t, c, p);

	if (t) {
		/* thread-scope counters */
		outp += sprintf(outp, "CPU: %d flags 0x%x\n", t->cpu_id, t->flags);
		outp += sprintf(outp, "TSC: %016llX\n", t->tsc);
		outp += sprintf(outp, "aperf: %016llX\n", t->aperf);
		outp += sprintf(outp, "mperf: %016llX\n", t->mperf);
		outp += sprintf(outp, "c1: %016llX\n", t->c1);

		if (DO_BIC(BIC_IPC))
			outp += sprintf(outp, "IPC: %lld\n", t->instr_count);

		if (DO_BIC(BIC_IRQ))
			outp += sprintf(outp, "IRQ: %lld\n", t->irq_count);
		if (DO_BIC(BIC_SMI))
			outp += sprintf(outp, "SMI: %d\n", t->smi_count);

		/* user-added thread-scope counters (sys.tp list) */
		for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
			outp +=
			    sprintf(outp, "tADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num,
				    t->counter[i], mp->sp->path);
		}
	}

	/* core-scope counters: print once per core */
	if (c && is_cpu_first_thread_in_core(t, c, p)) {
		outp += sprintf(outp, "core: %d\n", c->core_id);
		outp += sprintf(outp, "c3: %016llX\n", c->c3);
		outp += sprintf(outp, "c6: %016llX\n", c->c6);
		outp += sprintf(outp, "c7: %016llX\n", c->c7);
		outp += sprintf(outp, "DTS: %dC\n", c->core_temp_c);
		outp += sprintf(outp, "cpu_throt_count: %016llX\n", c->core_throt_cnt);

		/*
		 * NOTE(review): raw_value is multiplied by the (double) scale and
		 * then truncated back to an integer, which is printed in hex under
		 * the "Joules" label below — confirm this mixed scaled/hex output
		 * is intended rather than printing the raw counter as elsewhere.
		 */
		const unsigned long long energy_value = c->core_energy.raw_value * c->core_energy.scale;
		const double energy_scale = c->core_energy.scale;

		if (c->core_energy.unit == RAPL_UNIT_JOULES)
			outp += sprintf(outp, "Joules: %0llX (scale: %lf)\n", energy_value, energy_scale);

		/* user-added core-scope counters (sys.cp list) */
		for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
			outp +=
			    sprintf(outp, "cADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num,
				    c->counter[i], mp->sp->path);
		}
		outp += sprintf(outp, "mc6_us: %016llX\n", c->mc6_us);
	}

	/* package-scope counters: print once per package */
	if (p && is_cpu_first_core_in_package(t, c, p)) {
		outp += sprintf(outp, "package: %d\n", p->package_id);

		outp += sprintf(outp, "Weighted cores: %016llX\n", p->pkg_wtd_core_c0);
		outp += sprintf(outp, "Any cores: %016llX\n", p->pkg_any_core_c0);
		outp += sprintf(outp, "Any GFX: %016llX\n", p->pkg_any_gfxe_c0);
		outp += sprintf(outp, "CPU + GFX: %016llX\n", p->pkg_both_core_gfxe_c0);

		/* pc2/pc8/pc9/pc10 are printed unconditionally; pc3/pc6/pc7 only when selected */
		outp += sprintf(outp, "pc2: %016llX\n", p->pc2);
		if (DO_BIC(BIC_Pkgpc3))
			outp += sprintf(outp, "pc3: %016llX\n", p->pc3);
		if (DO_BIC(BIC_Pkgpc6))
			outp += sprintf(outp, "pc6: %016llX\n", p->pc6);
		if (DO_BIC(BIC_Pkgpc7))
			outp += sprintf(outp, "pc7: %016llX\n", p->pc7);
		outp += sprintf(outp, "pc8: %016llX\n", p->pc8);
		outp += sprintf(outp, "pc9: %016llX\n", p->pc9);
		outp += sprintf(outp, "pc10: %016llX\n", p->pc10);
		outp += sprintf(outp, "cpu_lpi: %016llX\n", p->cpu_lpi);
		outp += sprintf(outp, "sys_lpi: %016llX\n", p->sys_lpi);
		/* RAPL energy/throttle counters are dumped as raw (unscaled) hex */
		outp += sprintf(outp, "Joules PKG: %0llX\n", p->energy_pkg.raw_value);
		outp += sprintf(outp, "Joules COR: %0llX\n", p->energy_cores.raw_value);
		outp += sprintf(outp, "Joules GFX: %0llX\n", p->energy_gfx.raw_value);
		outp += sprintf(outp, "Joules RAM: %0llX\n", p->energy_dram.raw_value);
		outp += sprintf(outp, "Throttle PKG: %0llX\n", p->rapl_pkg_perf_status.raw_value);
		outp += sprintf(outp, "Throttle RAM: %0llX\n", p->rapl_dram_perf_status.raw_value);
		outp += sprintf(outp, "PTM: %dC\n", p->pkg_temp_c);

		/* user-added package-scope counters (sys.pp list) */
		for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
			outp +=
			    sprintf(outp, "pADDED [%d] %8s msr0x%x: %08llX %s\n", i, mp->name, mp->msr_num,
				    p->counter[i], mp->sp->path);
		}
	}

	outp += sprintf(outp, "\n");

	return 0;
}
2599 
2600 double rapl_counter_get_value(const struct rapl_counter *c, enum rapl_unit desired_unit, double interval)
2601 {
2602         assert(desired_unit != RAPL_UNIT_INVALID);
2603 
2604         /*
2605          * For now we don't expect anything other than joules,
2606          * so just simplify the logic.
2607          */
2608         assert(c->unit == RAPL_UNIT_JOULES);
2609 
2610         const double scaled = c->raw_value * c->scale;
2611 
2612         if (desired_unit == RAPL_UNIT_WATTS)
2613                 return scaled / interval;
2614         return scaled;
2615 }
2616 
2617 /*
2618  * column formatting convention & formats
2619  */
2620 int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
2621 {
2622         double interval_float, tsc;
2623         char *fmt8;
2624         int i;
2625         struct msr_counter *mp;
2626         struct perf_counter_info *pp;
2627         struct pmt_counter *ppmt;
2628         char *delim = "\t";
2629         int printed = 0;
2630 
2631         /* if showing only 1st thread in core and this isn't one, bail out */
2632         if (show_core_only && !is_cpu_first_thread_in_core(t, c, p))
2633                 return 0;
2634 
2635         /* if showing only 1st thread in pkg and this isn't one, bail out */
2636         if (show_pkg_only && !is_cpu_first_core_in_package(t, c, p))
2637                 return 0;
2638 
2639         /*if not summary line and --cpu is used */
2640         if ((t != &average.threads) && (cpu_subset && !CPU_ISSET_S(t->cpu_id, cpu_subset_size, cpu_subset)))
2641                 return 0;
2642 
2643         if (DO_BIC(BIC_USEC)) {
2644                 /* on each row, print how many usec each timestamp took to gather */
2645                 struct timeval tv;
2646 
2647                 timersub(&t->tv_end, &t->tv_begin, &tv);
2648                 outp += sprintf(outp, "%5ld\t", tv.tv_sec * 1000000 + tv.tv_usec);
2649         }
2650 
2651         /* Time_Of_Day_Seconds: on each row, print sec.usec last timestamp taken */
2652         if (DO_BIC(BIC_TOD))
2653                 outp += sprintf(outp, "%10ld.%06ld\t", t->tv_end.tv_sec, t->tv_end.tv_usec);
2654 
2655         interval_float = t->tv_delta.tv_sec + t->tv_delta.tv_usec / 1000000.0;
2656 
2657         tsc = t->tsc * tsc_tweak;
2658 
2659         /* topo columns, print blanks on 1st (average) line */
2660         if (t == &average.threads) {
2661                 if (DO_BIC(BIC_Package))
2662                         outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
2663                 if (DO_BIC(BIC_Die))
2664                         outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
2665                 if (DO_BIC(BIC_Node))
2666                         outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
2667                 if (DO_BIC(BIC_Core))
2668                         outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
2669                 if (DO_BIC(BIC_CPU))
2670                         outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
2671                 if (DO_BIC(BIC_APIC))
2672                         outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
2673                 if (DO_BIC(BIC_X2APIC))
2674                         outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
2675         } else {
2676                 if (DO_BIC(BIC_Package)) {
2677                         if (p)
2678                                 outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->package_id);
2679                         else
2680                                 outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
2681                 }
2682                 if (DO_BIC(BIC_Die)) {
2683                         if (c)
2684                                 outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), cpus[t->cpu_id].die_id);
2685                         else
2686                                 outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
2687                 }
2688                 if (DO_BIC(BIC_Node)) {
2689                         if (t)
2690                                 outp += sprintf(outp, "%s%d",
2691                                                 (printed++ ? delim : ""), cpus[t->cpu_id].physical_node_id);
2692                         else
2693                                 outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
2694                 }
2695                 if (DO_BIC(BIC_Core)) {
2696                         if (c)
2697                                 outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), c->core_id);
2698                         else
2699                                 outp += sprintf(outp, "%s-", (printed++ ? delim : ""));
2700                 }
2701                 if (DO_BIC(BIC_CPU))
2702                         outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), t->cpu_id);
2703                 if (DO_BIC(BIC_APIC))
2704                         outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), t->apic_id);
2705                 if (DO_BIC(BIC_X2APIC))
2706                         outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), t->x2apic_id);
2707         }
2708 
2709         if (DO_BIC(BIC_Avg_MHz))
2710                 outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), 1.0 / units * t->aperf / interval_float);
2711 
2712         if (DO_BIC(BIC_Busy))
2713                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->mperf / tsc);
2714 
2715         if (DO_BIC(BIC_Bzy_MHz)) {
2716                 if (has_base_hz)
2717                         outp +=
2718                             sprintf(outp, "%s%.0f", (printed++ ? delim : ""), base_hz / units * t->aperf / t->mperf);
2719                 else
2720                         outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""),
2721                                         tsc / units * t->aperf / t->mperf / interval_float);
2722         }
2723 
2724         if (DO_BIC(BIC_TSC_MHz))
2725                 outp += sprintf(outp, "%s%.0f", (printed++ ? delim : ""), 1.0 * t->tsc / units / interval_float);
2726 
2727         if (DO_BIC(BIC_IPC))
2728                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 1.0 * t->instr_count / t->aperf);
2729 
2730         /* IRQ */
2731         if (DO_BIC(BIC_IRQ)) {
2732                 if (sums_need_wide_columns)
2733                         outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), t->irq_count);
2734                 else
2735                         outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), t->irq_count);
2736         }
2737 
2738         /* SMI */
2739         if (DO_BIC(BIC_SMI))
2740                 outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), t->smi_count);
2741 
2742         /* Added counters */
2743         for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
2744                 if (mp->format == FORMAT_RAW) {
2745                         if (mp->width == 32)
2746                                 outp +=
2747                                     sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int)t->counter[i]);
2748                         else
2749                                 outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), t->counter[i]);
2750                 } else if (mp->format == FORMAT_DELTA) {
2751                         if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns)
2752                                 outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), t->counter[i]);
2753                         else
2754                                 outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), t->counter[i]);
2755                 } else if (mp->format == FORMAT_PERCENT) {
2756                         if (mp->type == COUNTER_USEC)
2757                                 outp +=
2758                                     sprintf(outp, "%s%.2f", (printed++ ? delim : ""),
2759                                             t->counter[i] / interval_float / 10000);
2760                         else
2761                                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->counter[i] / tsc);
2762                 }
2763         }
2764 
2765         /* Added perf counters */
2766         for (i = 0, pp = sys.perf_tp; pp; ++i, pp = pp->next) {
2767                 if (pp->format == FORMAT_RAW) {
2768                         if (pp->width == 32)
2769                                 outp +=
2770                                     sprintf(outp, "%s0x%08x", (printed++ ? delim : ""),
2771                                             (unsigned int)t->perf_counter[i]);
2772                         else
2773                                 outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), t->perf_counter[i]);
2774                 } else if (pp->format == FORMAT_DELTA) {
2775                         if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns)
2776                                 outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), t->perf_counter[i]);
2777                         else
2778                                 outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), t->perf_counter[i]);
2779                 } else if (pp->format == FORMAT_PERCENT) {
2780                         if (pp->type == COUNTER_USEC)
2781                                 outp +=
2782                                     sprintf(outp, "%s%.2f", (printed++ ? delim : ""),
2783                                             t->perf_counter[i] / interval_float / 10000);
2784                         else
2785                                 outp +=
2786                                     sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->perf_counter[i] / tsc);
2787                 }
2788         }
2789 
2790         for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) {
2791                 switch (ppmt->type) {
2792                 case PMT_TYPE_RAW:
2793                         if (pmt_counter_get_width(ppmt) <= 32)
2794                                 outp += sprintf(outp, "%s0x%08x", (printed++ ? delim : ""),
2795                                                 (unsigned int)t->pmt_counter[i]);
2796                         else
2797                                 outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), t->pmt_counter[i]);
2798 
2799                         break;
2800 
2801                 case PMT_TYPE_XTAL_TIME:
2802                         const unsigned long value_raw = t->pmt_counter[i];
2803                         const double value_converted = 100.0 * value_raw / crystal_hz / interval_float;
2804 
2805                         outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted);
2806                         break;
2807                 }
2808         }
2809 
2810         /* C1 */
2811         if (DO_BIC(BIC_CPU_c1))
2812                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * t->c1 / tsc);
2813 
2814         /* print per-core data only for 1st thread in core */
2815         if (!is_cpu_first_thread_in_core(t, c, p))
2816                 goto done;
2817 
2818         if (DO_BIC(BIC_CPU_c3))
2819                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->c3 / tsc);
2820         if (DO_BIC(BIC_CPU_c6))
2821                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->c6 / tsc);
2822         if (DO_BIC(BIC_CPU_c7))
2823                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->c7 / tsc);
2824 
2825         /* Mod%c6 */
2826         if (DO_BIC(BIC_Mod_c6))
2827                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->mc6_us / tsc);
2828 
2829         if (DO_BIC(BIC_CoreTmp))
2830                 outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), c->core_temp_c);
2831 
2832         /* Core throttle count */
2833         if (DO_BIC(BIC_CORE_THROT_CNT))
2834                 outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), c->core_throt_cnt);
2835 
2836         for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
2837                 if (mp->format == FORMAT_RAW) {
2838                         if (mp->width == 32)
2839                                 outp +=
2840                                     sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int)c->counter[i]);
2841                         else
2842                                 outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), c->counter[i]);
2843                 } else if (mp->format == FORMAT_DELTA) {
2844                         if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns)
2845                                 outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), c->counter[i]);
2846                         else
2847                                 outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), c->counter[i]);
2848                 } else if (mp->format == FORMAT_PERCENT) {
2849                         outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->counter[i] / tsc);
2850                 }
2851         }
2852 
2853         for (i = 0, pp = sys.perf_cp; pp; i++, pp = pp->next) {
2854                 if (pp->format == FORMAT_RAW) {
2855                         if (pp->width == 32)
2856                                 outp +=
2857                                     sprintf(outp, "%s0x%08x", (printed++ ? delim : ""),
2858                                             (unsigned int)c->perf_counter[i]);
2859                         else
2860                                 outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), c->perf_counter[i]);
2861                 } else if (pp->format == FORMAT_DELTA) {
2862                         if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns)
2863                                 outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), c->perf_counter[i]);
2864                         else
2865                                 outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), c->perf_counter[i]);
2866                 } else if (pp->format == FORMAT_PERCENT) {
2867                         outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * c->perf_counter[i] / tsc);
2868                 }
2869         }
2870 
2871         for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) {
2872                 switch (ppmt->type) {
2873                 case PMT_TYPE_RAW:
2874                         if (pmt_counter_get_width(ppmt) <= 32)
2875                                 outp += sprintf(outp, "%s0x%08x", (printed++ ? delim : ""),
2876                                                 (unsigned int)c->pmt_counter[i]);
2877                         else
2878                                 outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), c->pmt_counter[i]);
2879 
2880                         break;
2881 
2882                 case PMT_TYPE_XTAL_TIME:
2883                         const unsigned long value_raw = c->pmt_counter[i];
2884                         const double value_converted = 100.0 * value_raw / crystal_hz / interval_float;
2885 
2886                         outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted);
2887                         break;
2888                 }
2889         }
2890 
2891         fmt8 = "%s%.2f";
2892 
2893         if (DO_BIC(BIC_CorWatt) && platform->has_per_core_rapl)
2894                 outp +=
2895                     sprintf(outp, fmt8, (printed++ ? delim : ""),
2896                             rapl_counter_get_value(&c->core_energy, RAPL_UNIT_WATTS, interval_float));
2897         if (DO_BIC(BIC_Cor_J) && platform->has_per_core_rapl)
2898                 outp += sprintf(outp, fmt8, (printed++ ? delim : ""),
2899                                 rapl_counter_get_value(&c->core_energy, RAPL_UNIT_JOULES, interval_float));
2900 
2901         /* print per-package data only for 1st core in package */
2902         if (!is_cpu_first_core_in_package(t, c, p))
2903                 goto done;
2904 
2905         /* PkgTmp */
2906         if (DO_BIC(BIC_PkgTmp))
2907                 outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->pkg_temp_c);
2908 
2909         /* GFXrc6 */
2910         if (DO_BIC(BIC_GFX_rc6)) {
2911                 if (p->gfx_rc6_ms == -1) {      /* detect GFX counter reset */
2912                         outp += sprintf(outp, "%s**.**", (printed++ ? delim : ""));
2913                 } else {
2914                         outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""),
2915                                         p->gfx_rc6_ms / 10.0 / interval_float);
2916                 }
2917         }
2918 
2919         /* GFXMHz */
2920         if (DO_BIC(BIC_GFXMHz))
2921                 outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->gfx_mhz);
2922 
2923         /* GFXACTMHz */
2924         if (DO_BIC(BIC_GFXACTMHz))
2925                 outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->gfx_act_mhz);
2926 
2927         /* SAMmc6 */
2928         if (DO_BIC(BIC_SAM_mc6)) {
2929                 if (p->sam_mc6_ms == -1) {      /* detect GFX counter reset */
2930                         outp += sprintf(outp, "%s**.**", (printed++ ? delim : ""));
2931                 } else {
2932                         outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""),
2933                                         p->sam_mc6_ms / 10.0 / interval_float);
2934                 }
2935         }
2936 
2937         /* SAMMHz */
2938         if (DO_BIC(BIC_SAMMHz))
2939                 outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->sam_mhz);
2940 
2941         /* SAMACTMHz */
2942         if (DO_BIC(BIC_SAMACTMHz))
2943                 outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->sam_act_mhz);
2944 
2945         /* Totl%C0, Any%C0 GFX%C0 CPUGFX% */
2946         if (DO_BIC(BIC_Totl_c0))
2947                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_wtd_core_c0 / tsc);
2948         if (DO_BIC(BIC_Any_c0))
2949                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_any_core_c0 / tsc);
2950         if (DO_BIC(BIC_GFX_c0))
2951                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_any_gfxe_c0 / tsc);
2952         if (DO_BIC(BIC_CPUGFX))
2953                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pkg_both_core_gfxe_c0 / tsc);
2954 
2955         if (DO_BIC(BIC_Pkgpc2))
2956                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc2 / tsc);
2957         if (DO_BIC(BIC_Pkgpc3))
2958                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc3 / tsc);
2959         if (DO_BIC(BIC_Pkgpc6))
2960                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc6 / tsc);
2961         if (DO_BIC(BIC_Pkgpc7))
2962                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc7 / tsc);
2963         if (DO_BIC(BIC_Pkgpc8))
2964                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc8 / tsc);
2965         if (DO_BIC(BIC_Pkgpc9))
2966                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc9 / tsc);
2967         if (DO_BIC(BIC_Pkgpc10))
2968                 outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->pc10 / tsc);
2969 
2970         if (DO_BIC(BIC_Diec6))
2971                 outp +=
2972                     sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->die_c6 / crystal_hz / interval_float);
2973 
2974         if (DO_BIC(BIC_CPU_LPI)) {
2975                 if (p->cpu_lpi >= 0)
2976                         outp +=
2977                             sprintf(outp, "%s%.2f", (printed++ ? delim : ""),
2978                                     100.0 * p->cpu_lpi / 1000000.0 / interval_float);
2979                 else
2980                         outp += sprintf(outp, "%s(neg)", (printed++ ? delim : ""));
2981         }
2982         if (DO_BIC(BIC_SYS_LPI)) {
2983                 if (p->sys_lpi >= 0)
2984                         outp +=
2985                             sprintf(outp, "%s%.2f", (printed++ ? delim : ""),
2986                                     100.0 * p->sys_lpi / 1000000.0 / interval_float);
2987                 else
2988                         outp += sprintf(outp, "%s(neg)", (printed++ ? delim : ""));
2989         }
2990 
2991         if (DO_BIC(BIC_PkgWatt))
2992                 outp +=
2993                     sprintf(outp, fmt8, (printed++ ? delim : ""),
2994                             rapl_counter_get_value(&p->energy_pkg, RAPL_UNIT_WATTS, interval_float));
2995         if (DO_BIC(BIC_CorWatt) && !platform->has_per_core_rapl)
2996                 outp +=
2997                     sprintf(outp, fmt8, (printed++ ? delim : ""),
2998                             rapl_counter_get_value(&p->energy_cores, RAPL_UNIT_WATTS, interval_float));
2999         if (DO_BIC(BIC_GFXWatt))
3000                 outp +=
3001                     sprintf(outp, fmt8, (printed++ ? delim : ""),
3002                             rapl_counter_get_value(&p->energy_gfx, RAPL_UNIT_WATTS, interval_float));
3003         if (DO_BIC(BIC_RAMWatt))
3004                 outp +=
3005                     sprintf(outp, fmt8, (printed++ ? delim : ""),
3006                             rapl_counter_get_value(&p->energy_dram, RAPL_UNIT_WATTS, interval_float));
3007         if (DO_BIC(BIC_Pkg_J))
3008                 outp += sprintf(outp, fmt8, (printed++ ? delim : ""),
3009                                 rapl_counter_get_value(&p->energy_pkg, RAPL_UNIT_JOULES, interval_float));
3010         if (DO_BIC(BIC_Cor_J) && !platform->has_per_core_rapl)
3011                 outp += sprintf(outp, fmt8, (printed++ ? delim : ""),
3012                                 rapl_counter_get_value(&p->energy_cores, RAPL_UNIT_JOULES, interval_float));
3013         if (DO_BIC(BIC_GFX_J))
3014                 outp += sprintf(outp, fmt8, (printed++ ? delim : ""),
3015                                 rapl_counter_get_value(&p->energy_gfx, RAPL_UNIT_JOULES, interval_float));
3016         if (DO_BIC(BIC_RAM_J))
3017                 outp += sprintf(outp, fmt8, (printed++ ? delim : ""),
3018                                 rapl_counter_get_value(&p->energy_dram, RAPL_UNIT_JOULES, interval_float));
3019         if (DO_BIC(BIC_PKG__))
3020                 outp +=
3021                     sprintf(outp, fmt8, (printed++ ? delim : ""),
3022                             rapl_counter_get_value(&p->rapl_pkg_perf_status, RAPL_UNIT_WATTS, interval_float));
3023         if (DO_BIC(BIC_RAM__))
3024                 outp +=
3025                     sprintf(outp, fmt8, (printed++ ? delim : ""),
3026                             rapl_counter_get_value(&p->rapl_dram_perf_status, RAPL_UNIT_WATTS, interval_float));
3027         /* UncMHz */
3028         if (DO_BIC(BIC_UNCORE_MHZ))
3029                 outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), p->uncore_mhz);
3030 
3031         for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
3032                 if (mp->format == FORMAT_RAW) {
3033                         if (mp->width == 32)
3034                                 outp +=
3035                                     sprintf(outp, "%s0x%08x", (printed++ ? delim : ""), (unsigned int)p->counter[i]);
3036                         else
3037                                 outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), p->counter[i]);
3038                 } else if (mp->format == FORMAT_DELTA) {
3039                         if ((mp->type == COUNTER_ITEMS) && sums_need_wide_columns)
3040                                 outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), p->counter[i]);
3041                         else
3042                                 outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), p->counter[i]);
3043                 } else if (mp->format == FORMAT_PERCENT) {
3044                         outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->counter[i] / tsc);
3045                 } else if (mp->type == COUNTER_K2M)
3046                         outp += sprintf(outp, "%s%d", (printed++ ? delim : ""), (unsigned int)p->counter[i] / 1000);
3047         }
3048 
3049         for (i = 0, pp = sys.perf_pp; pp; i++, pp = pp->next) {
3050                 if (pp->format == FORMAT_RAW) {
3051                         if (pp->width == 32)
3052                                 outp +=
3053                                     sprintf(outp, "%s0x%08x", (printed++ ? delim : ""),
3054                                             (unsigned int)p->perf_counter[i]);
3055                         else
3056                                 outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), p->perf_counter[i]);
3057                 } else if (pp->format == FORMAT_DELTA) {
3058                         if ((pp->type == COUNTER_ITEMS) && sums_need_wide_columns)
3059                                 outp += sprintf(outp, "%s%8lld", (printed++ ? delim : ""), p->perf_counter[i]);
3060                         else
3061                                 outp += sprintf(outp, "%s%lld", (printed++ ? delim : ""), p->perf_counter[i]);
3062                 } else if (pp->format == FORMAT_PERCENT) {
3063                         outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->perf_counter[i] / tsc);
3064                 } else if (pp->type == COUNTER_K2M) {
3065                         outp +=
3066                             sprintf(outp, "%s%d", (printed++ ? delim : ""), (unsigned int)p->perf_counter[i] / 1000);
3067                 }
3068         }
3069 
3070         for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) {
3071                 switch (ppmt->type) {
3072                 case PMT_TYPE_RAW:
3073                         if (pmt_counter_get_width(ppmt) <= 32)
3074                                 outp += sprintf(outp, "%s0x%08x", (printed++ ? delim : ""),
3075                                                 (unsigned int)p->pmt_counter[i]);
3076                         else
3077                                 outp += sprintf(outp, "%s0x%016llx", (printed++ ? delim : ""), p->pmt_counter[i]);
3078 
3079                         break;
3080 
3081                 case PMT_TYPE_XTAL_TIME:
3082                         const unsigned long value_raw = p->pmt_counter[i];
3083                         const double value_converted = 100.0 * value_raw / crystal_hz / interval_float;
3084 
3085                         outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted);
3086                         break;
3087                 }
3088         }
3089 
3090 done:
3091         if (*(outp - 1) != '\n')
3092                 outp += sprintf(outp, "\n");
3093 
3094         return 0;
3095 }
3096 
3097 void flush_output_stdout(void)
3098 {
3099         FILE *filep;
3100 
3101         if (outf == stderr)
3102                 filep = stdout;
3103         else
3104                 filep = outf;
3105 
3106         fputs(output_buffer, filep);
3107         fflush(filep);
3108 
3109         outp = output_buffer;
3110 }
3111 
3112 void flush_output_stderr(void)
3113 {
3114         fputs(output_buffer, outf);
3115         fflush(outf);
3116         outp = output_buffer;
3117 }
3118 
3119 void format_all_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
3120 {
3121         static int count;
3122 
3123         if ((!count || (header_iterations && !(count % header_iterations))) || !summary_only)
3124                 print_header("\t");
3125 
3126         format_counters(&average.threads, &average.cores, &average.packages);
3127 
3128         count++;
3129 
3130         if (summary_only)
3131                 return;
3132 
3133         for_all_cpus(format_counters, t, c, p);
3134 }
3135 
/*
 * old = (new - old) modulo 2^32.
 * The shifts discard the upper 32 bits of each operand, so a 32-bit
 * counter that wrapped between samples still yields the correct
 * (small, positive) delta.
 * NOTE: both arguments are evaluated more than once -- pass only
 * side-effect-free expressions.
 */
#define DELTA_WRAP32(new, old)                  \
        old = ((((unsigned long long)new << 32) - ((unsigned long long)old << 32)) >> 32);
3138 
3139 int delta_package(struct pkg_data *new, struct pkg_data *old)
3140 {
3141         int i;
3142         struct msr_counter *mp;
3143         struct perf_counter_info *pp;
3144         struct pmt_counter *ppmt;
3145 
3146         if (DO_BIC(BIC_Totl_c0))
3147                 old->pkg_wtd_core_c0 = new->pkg_wtd_core_c0 - old->pkg_wtd_core_c0;
3148         if (DO_BIC(BIC_Any_c0))
3149                 old->pkg_any_core_c0 = new->pkg_any_core_c0 - old->pkg_any_core_c0;
3150         if (DO_BIC(BIC_GFX_c0))
3151                 old->pkg_any_gfxe_c0 = new->pkg_any_gfxe_c0 - old->pkg_any_gfxe_c0;
3152         if (DO_BIC(BIC_CPUGFX))
3153                 old->pkg_both_core_gfxe_c0 = new->pkg_both_core_gfxe_c0 - old->pkg_both_core_gfxe_c0;
3154 
3155         old->pc2 = new->pc2 - old->pc2;
3156         if (DO_BIC(BIC_Pkgpc3))
3157                 old->pc3 = new->pc3 - old->pc3;
3158         if (DO_BIC(BIC_Pkgpc6))
3159                 old->pc6 = new->pc6 - old->pc6;
3160         if (DO_BIC(BIC_Pkgpc7))
3161                 old->pc7 = new->pc7 - old->pc7;
3162         old->pc8 = new->pc8 - old->pc8;
3163         old->pc9 = new->pc9 - old->pc9;
3164         old->pc10 = new->pc10 - old->pc10;
3165         old->die_c6 = new->die_c6 - old->die_c6;
3166         old->cpu_lpi = new->cpu_lpi - old->cpu_lpi;
3167         old->sys_lpi = new->sys_lpi - old->sys_lpi;
3168         old->pkg_temp_c = new->pkg_temp_c;
3169 
3170         /* flag an error when rc6 counter resets/wraps */
3171         if (old->gfx_rc6_ms > new->gfx_rc6_ms)
3172                 old->gfx_rc6_ms = -1;
3173         else
3174                 old->gfx_rc6_ms = new->gfx_rc6_ms - old->gfx_rc6_ms;
3175 
3176         old->uncore_mhz = new->uncore_mhz;
3177         old->gfx_mhz = new->gfx_mhz;
3178         old->gfx_act_mhz = new->gfx_act_mhz;
3179 
3180         /* flag an error when mc6 counter resets/wraps */
3181         if (old->sam_mc6_ms > new->sam_mc6_ms)
3182                 old->sam_mc6_ms = -1;
3183         else
3184                 old->sam_mc6_ms = new->sam_mc6_ms - old->sam_mc6_ms;
3185 
3186         old->sam_mhz = new->sam_mhz;
3187         old->sam_act_mhz = new->sam_act_mhz;
3188 
3189         old->energy_pkg.raw_value = new->energy_pkg.raw_value - old->energy_pkg.raw_value;
3190         old->energy_cores.raw_value = new->energy_cores.raw_value - old->energy_cores.raw_value;
3191         old->energy_gfx.raw_value = new->energy_gfx.raw_value - old->energy_gfx.raw_value;
3192         old->energy_dram.raw_value = new->energy_dram.raw_value - old->energy_dram.raw_value;
3193         old->rapl_pkg_perf_status.raw_value = new->rapl_pkg_perf_status.raw_value - old->rapl_pkg_perf_status.raw_value;
3194         old->rapl_dram_perf_status.raw_value =
3195             new->rapl_dram_perf_status.raw_value - old->rapl_dram_perf_status.raw_value;
3196 
3197         for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
3198                 if (mp->format == FORMAT_RAW)
3199                         old->counter[i] = new->counter[i];
3200                 else if (mp->format == FORMAT_AVERAGE)
3201                         old->counter[i] = new->counter[i];
3202                 else
3203                         old->counter[i] = new->counter[i] - old->counter[i];
3204         }
3205 
3206         for (i = 0, pp = sys.perf_pp; pp; i++, pp = pp->next) {
3207                 if (pp->format == FORMAT_RAW)
3208                         old->perf_counter[i] = new->perf_counter[i];
3209                 else if (pp->format == FORMAT_AVERAGE)
3210                         old->perf_counter[i] = new->perf_counter[i];
3211                 else
3212                         old->perf_counter[i] = new->perf_counter[i] - old->perf_counter[i];
3213         }
3214 
3215         for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) {
3216                 if (ppmt->format == FORMAT_RAW)
3217                         old->pmt_counter[i] = new->pmt_counter[i];
3218                 else
3219                         old->pmt_counter[i] = new->pmt_counter[i] - old->pmt_counter[i];
3220         }
3221 
3222         return 0;
3223 }
3224 
3225 void delta_core(struct core_data *new, struct core_data *old)
3226 {
3227         int i;
3228         struct msr_counter *mp;
3229         struct perf_counter_info *pp;
3230         struct pmt_counter *ppmt;
3231 
3232         old->c3 = new->c3 - old->c3;
3233         old->c6 = new->c6 - old->c6;
3234         old->c7 = new->c7 - old->c7;
3235         old->core_temp_c = new->core_temp_c;
3236         old->core_throt_cnt = new->core_throt_cnt;
3237         old->mc6_us = new->mc6_us - old->mc6_us;
3238 
3239         DELTA_WRAP32(new->core_energy.raw_value, old->core_energy.raw_value);
3240 
3241         for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
3242                 if (mp->format == FORMAT_RAW)
3243                         old->counter[i] = new->counter[i];
3244                 else
3245                         old->counter[i] = new->counter[i] - old->counter[i];
3246         }
3247 
3248         for (i = 0, pp = sys.perf_cp; pp; i++, pp = pp->next) {
3249                 if (pp->format == FORMAT_RAW)
3250                         old->perf_counter[i] = new->perf_counter[i];
3251                 else
3252                         old->perf_counter[i] = new->perf_counter[i] - old->perf_counter[i];
3253         }
3254 
3255         for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) {
3256                 if (ppmt->format == FORMAT_RAW)
3257                         old->pmt_counter[i] = new->pmt_counter[i];
3258                 else
3259                         old->pmt_counter[i] = new->pmt_counter[i] - old->pmt_counter[i];
3260         }
3261 }
3262 
3263 int soft_c1_residency_display(int bic)
3264 {
3265         if (!DO_BIC(BIC_CPU_c1) || platform->has_msr_core_c1_res)
3266                 return 0;
3267 
3268         return DO_BIC_READ(bic);
3269 }
3270 
3271 /*
3272  * old = new - old
3273  */
3274 int delta_thread(struct thread_data *new, struct thread_data *old, struct core_data *core_delta)
3275 {
3276         int i;
3277         struct msr_counter *mp;
3278         struct perf_counter_info *pp;
3279         struct pmt_counter *ppmt;
3280 
3281         /* we run cpuid just the 1st time, copy the results */
3282         if (DO_BIC(BIC_APIC))
3283                 new->apic_id = old->apic_id;
3284         if (DO_BIC(BIC_X2APIC))
3285                 new->x2apic_id = old->x2apic_id;
3286 
3287         /*
3288          * the timestamps from start of measurement interval are in "old"
3289          * the timestamp from end of measurement interval are in "new"
3290          * over-write old w/ new so we can print end of interval values
3291          */
3292 
3293         timersub(&new->tv_begin, &old->tv_begin, &old->tv_delta);
3294         old->tv_begin = new->tv_begin;
3295         old->tv_end = new->tv_end;
3296 
3297         old->tsc = new->tsc - old->tsc;
3298 
3299         /* check for TSC < 1 Mcycles over interval */
3300         if (old->tsc < (1000 * 1000))
3301                 errx(-3, "Insanely slow TSC rate, TSC stops in idle?\n"
3302                      "You can disable all c-states by booting with \"idle=poll\"\n"
3303                      "or just the deep ones with \"processor.max_cstate=1\"");
3304 
3305         old->c1 = new->c1 - old->c1;
3306 
3307         if (DO_BIC(BIC_Avg_MHz) || DO_BIC(BIC_Busy) || DO_BIC(BIC_Bzy_MHz) || DO_BIC(BIC_IPC)
3308             || soft_c1_residency_display(BIC_Avg_MHz)) {
3309                 if ((new->aperf > old->aperf) && (new->mperf > old->mperf)) {
3310                         old->aperf = new->aperf - old->aperf;
3311                         old->mperf = new->mperf - old->mperf;
3312                 } else {
3313                         return -1;
3314                 }
3315         }
3316 
3317         if (platform->has_msr_core_c1_res) {
3318                 /*
3319                  * Some models have a dedicated C1 residency MSR,
3320                  * which should be more accurate than the derivation below.
3321                  */
3322         } else {
3323                 /*
3324                  * As counter collection is not atomic,
3325                  * it is possible for mperf's non-halted cycles + idle states
3326                  * to exceed TSC's all cycles: show c1 = 0% in that case.
3327                  */
3328                 if ((old->mperf + core_delta->c3 + core_delta->c6 + core_delta->c7) > (old->tsc * tsc_tweak))
3329                         old->c1 = 0;
3330                 else {
3331                         /* normal case, derive c1 */
3332                         old->c1 = (old->tsc * tsc_tweak) - old->mperf - core_delta->c3
3333                             - core_delta->c6 - core_delta->c7;
3334                 }
3335         }
3336 
3337         if (old->mperf == 0) {
3338                 if (debug > 1)
3339                         fprintf(outf, "cpu%d MPERF 0!\n", old->cpu_id);
3340                 old->mperf = 1; /* divide by 0 protection */
3341         }
3342 
3343         if (DO_BIC(BIC_IPC))
3344                 old->instr_count = new->instr_count - old->instr_count;
3345 
3346         if (DO_BIC(BIC_IRQ))
3347                 old->irq_count = new->irq_count - old->irq_count;
3348 
3349         if (DO_BIC(BIC_SMI))
3350                 old->smi_count = new->smi_count - old->smi_count;
3351 
3352         for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
3353                 if (mp->format == FORMAT_RAW)
3354                         old->counter[i] = new->counter[i];
3355                 else
3356                         old->counter[i] = new->counter[i] - old->counter[i];
3357         }
3358 
3359         for (i = 0, pp = sys.perf_tp; pp; i++, pp = pp->next) {
3360                 if (pp->format == FORMAT_RAW)
3361                         old->perf_counter[i] = new->perf_counter[i];
3362                 else
3363                         old->perf_counter[i] = new->perf_counter[i] - old->perf_counter[i];
3364         }
3365 
3366         for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) {
3367                 if (ppmt->format == FORMAT_RAW)
3368                         old->pmt_counter[i] = new->pmt_counter[i];
3369                 else
3370                         old->pmt_counter[i] = new->pmt_counter[i] - old->pmt_counter[i];
3371         }
3372 
3373         return 0;
3374 }
3375 
/*
 * Compute all interval deltas for one CPU (thread, and -- where this
 * CPU is first in its core/package -- core and package deltas).
 * Returns 0 on success, or the thread-delta error code.
 */
int delta_cpu(struct thread_data *t, struct core_data *c,
	      struct pkg_data *p, struct thread_data *t2, struct core_data *c2, struct pkg_data *p2)
{
	int rc;

	/* core deltas are computed once, by the 1st thread in each core */
	if (is_cpu_first_thread_in_core(t, c, p))
		delta_core(c, c2);

	/* every thread gets a thread delta; c2 now holds the core delta */
	rc = delta_thread(t, t2, c2);
	if (rc)
		return rc;

	/* package deltas are computed once, by the 1st core in each package */
	if (is_cpu_first_core_in_package(t, c, p))
		return delta_package(p, p2);

	return 0;
}
3396 
3397 void rapl_counter_clear(struct rapl_counter *c)
3398 {
3399         c->raw_value = 0;
3400         c->scale = 0.0;
3401         c->unit = RAPL_UNIT_INVALID;
3402 }
3403 
3404 void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
3405 {
3406         int i;
3407         struct msr_counter *mp;
3408 
3409         t->tv_begin.tv_sec = 0;
3410         t->tv_begin.tv_usec = 0;
3411         t->tv_end.tv_sec = 0;
3412         t->tv_end.tv_usec = 0;
3413         t->tv_delta.tv_sec = 0;
3414         t->tv_delta.tv_usec = 0;
3415 
3416         t->tsc = 0;
3417         t->aperf = 0;
3418         t->mperf = 0;
3419         t->c1 = 0;
3420 
3421         t->instr_count = 0;
3422 
3423         t->irq_count = 0;
3424         t->smi_count = 0;
3425 
3426         c->c3 = 0;
3427         c->c6 = 0;
3428         c->c7 = 0;
3429         c->mc6_us = 0;
3430         c->core_temp_c = 0;
3431         rapl_counter_clear(&c->core_energy);
3432         c->core_throt_cnt = 0;
3433 
3434         p->pkg_wtd_core_c0 = 0;
3435         p->pkg_any_core_c0 = 0;
3436         p->pkg_any_gfxe_c0 = 0;
3437         p->pkg_both_core_gfxe_c0 = 0;
3438 
3439         p->pc2 = 0;
3440         if (DO_BIC(BIC_Pkgpc3))
3441                 p->pc3 = 0;
3442         if (DO_BIC(BIC_Pkgpc6))
3443                 p->pc6 = 0;
3444         if (DO_BIC(BIC_Pkgpc7))
3445                 p->pc7 = 0;
3446         p->pc8 = 0;
3447         p->pc9 = 0;
3448         p->pc10 = 0;
3449         p->die_c6 = 0;
3450         p->cpu_lpi = 0;
3451         p->sys_lpi = 0;
3452 
3453         rapl_counter_clear(&p->energy_pkg);
3454         rapl_counter_clear(&p->energy_dram);
3455         rapl_counter_clear(&p->energy_cores);
3456         rapl_counter_clear(&p->energy_gfx);
3457         rapl_counter_clear(&p->rapl_pkg_perf_status);
3458         rapl_counter_clear(&p->rapl_dram_perf_status);
3459         p->pkg_temp_c = 0;
3460 
3461         p->gfx_rc6_ms = 0;
3462         p->uncore_mhz = 0;
3463         p->gfx_mhz = 0;
3464         p->gfx_act_mhz = 0;
3465         p->sam_mc6_ms = 0;
3466         p->sam_mhz = 0;
3467         p->sam_act_mhz = 0;
3468         for (i = 0, mp = sys.tp; mp; i++, mp = mp->next)
3469                 t->counter[i] = 0;
3470 
3471         for (i = 0, mp = sys.cp; mp; i++, mp = mp->next)
3472                 c->counter[i] = 0;
3473 
3474         for (i = 0, mp = sys.pp; mp; i++, mp = mp->next)
3475                 p->counter[i] = 0;
3476 
3477         memset(&t->perf_counter[0], 0, sizeof(t->perf_counter));
3478         memset(&c->perf_counter[0], 0, sizeof(c->perf_counter));
3479         memset(&p->perf_counter[0], 0, sizeof(p->perf_counter));
3480 
3481         memset(&t->pmt_counter[0], 0, ARRAY_SIZE(t->pmt_counter));
3482         memset(&c->pmt_counter[0], 0, ARRAY_SIZE(c->pmt_counter));
3483         memset(&p->pmt_counter[0], 0, ARRAY_SIZE(p->pmt_counter));
3484 }
3485 
3486 void rapl_counter_accumulate(struct rapl_counter *dst, const struct rapl_counter *src)
3487 {
3488         /* Copy unit and scale from src if dst is not initialized */
3489         if (dst->unit == RAPL_UNIT_INVALID) {
3490                 dst->unit = src->unit;
3491                 dst->scale = src->scale;
3492         }
3493 
3494         assert(dst->unit == src->unit);
3495         assert(dst->scale == src->scale);
3496 
3497         dst->raw_value += src->raw_value;
3498 }
3499 
/*
 * Accumulate one CPU's counters into the global "average" totals.
 * Called for every CPU; per-core fields are added only by the first
 * thread in each core, per-package fields only by the first core in
 * each package.  Always returns 0.
 */
int sum_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
{
	int i;
	struct msr_counter *mp;
	struct perf_counter_info *pp;
	struct pmt_counter *ppmt;

	/* copy un-changing apic_id's */
	if (DO_BIC(BIC_APIC))
		average.threads.apic_id = t->apic_id;
	if (DO_BIC(BIC_X2APIC))
		average.threads.x2apic_id = t->x2apic_id;

	/* remember first tv_begin */
	if (average.threads.tv_begin.tv_sec == 0)
		average.threads.tv_begin = t->tv_begin;

	/* remember last tv_end */
	average.threads.tv_end = t->tv_end;

	average.threads.tsc += t->tsc;
	average.threads.aperf += t->aperf;
	average.threads.mperf += t->mperf;
	average.threads.c1 += t->c1;

	average.threads.instr_count += t->instr_count;

	average.threads.irq_count += t->irq_count;
	average.threads.smi_count += t->smi_count;

	/* RAW-format counters are not summed into the average */
	for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
		if (mp->format == FORMAT_RAW)
			continue;
		average.threads.counter[i] += t->counter[i];
	}

	for (i = 0, pp = sys.perf_tp; pp; i++, pp = pp->next) {
		if (pp->format == FORMAT_RAW)
			continue;
		average.threads.perf_counter[i] += t->perf_counter[i];
	}

	for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) {
		average.threads.pmt_counter[i] += t->pmt_counter[i];
	}

	/* sum per-core values only for 1st thread in core */
	if (!is_cpu_first_thread_in_core(t, c, p))
		return 0;

	average.cores.c3 += c->c3;
	average.cores.c6 += c->c6;
	average.cores.c7 += c->c7;
	average.cores.mc6_us += c->mc6_us;

	/* temperature and throttle count report the worst core, not a sum */
	average.cores.core_temp_c = MAX(average.cores.core_temp_c, c->core_temp_c);
	average.cores.core_throt_cnt = MAX(average.cores.core_throt_cnt, c->core_throt_cnt);

	rapl_counter_accumulate(&average.cores.core_energy, &c->core_energy);

	for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
		if (mp->format == FORMAT_RAW)
			continue;
		average.cores.counter[i] += c->counter[i];
	}

	for (i = 0, pp = sys.perf_cp; pp; i++, pp = pp->next) {
		if (pp->format == FORMAT_RAW)
			continue;
		average.cores.perf_counter[i] += c->perf_counter[i];
	}

	for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) {
		average.cores.pmt_counter[i] += c->pmt_counter[i];
	}

	/* sum per-pkg values only for 1st core in pkg */
	if (!is_cpu_first_core_in_package(t, c, p))
		return 0;

	if (DO_BIC(BIC_Totl_c0))
		average.packages.pkg_wtd_core_c0 += p->pkg_wtd_core_c0;
	if (DO_BIC(BIC_Any_c0))
		average.packages.pkg_any_core_c0 += p->pkg_any_core_c0;
	if (DO_BIC(BIC_GFX_c0))
		average.packages.pkg_any_gfxe_c0 += p->pkg_any_gfxe_c0;
	if (DO_BIC(BIC_CPUGFX))
		average.packages.pkg_both_core_gfxe_c0 += p->pkg_both_core_gfxe_c0;

	average.packages.pc2 += p->pc2;
	if (DO_BIC(BIC_Pkgpc3))
		average.packages.pc3 += p->pc3;
	if (DO_BIC(BIC_Pkgpc6))
		average.packages.pc6 += p->pc6;
	if (DO_BIC(BIC_Pkgpc7))
		average.packages.pc7 += p->pc7;
	average.packages.pc8 += p->pc8;
	average.packages.pc9 += p->pc9;
	average.packages.pc10 += p->pc10;
	average.packages.die_c6 += p->die_c6;

	/* copied, not summed -- presumably system-wide counters; verify */
	average.packages.cpu_lpi = p->cpu_lpi;
	average.packages.sys_lpi = p->sys_lpi;

	rapl_counter_accumulate(&average.packages.energy_pkg, &p->energy_pkg);
	rapl_counter_accumulate(&average.packages.energy_dram, &p->energy_dram);
	rapl_counter_accumulate(&average.packages.energy_cores, &p->energy_cores);
	rapl_counter_accumulate(&average.packages.energy_gfx, &p->energy_gfx);

	/* instantaneous/graphics values: keep the last package's reading */
	average.packages.gfx_rc6_ms = p->gfx_rc6_ms;
	average.packages.uncore_mhz = p->uncore_mhz;
	average.packages.gfx_mhz = p->gfx_mhz;
	average.packages.gfx_act_mhz = p->gfx_act_mhz;
	average.packages.sam_mc6_ms = p->sam_mc6_ms;
	average.packages.sam_mhz = p->sam_mhz;
	average.packages.sam_act_mhz = p->sam_act_mhz;

	average.packages.pkg_temp_c = MAX(average.packages.pkg_temp_c, p->pkg_temp_c);

	rapl_counter_accumulate(&average.packages.rapl_pkg_perf_status, &p->rapl_pkg_perf_status);
	rapl_counter_accumulate(&average.packages.rapl_dram_perf_status, &p->rapl_dram_perf_status);

	/* NOTE(review): the num_packages == 0 guard looks suspicious -- confirm intent */
	for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
		if ((mp->format == FORMAT_RAW) && (topo.num_packages == 0))
			average.packages.counter[i] = p->counter[i];
		else
			average.packages.counter[i] += p->counter[i];
	}

	for (i = 0, pp = sys.perf_pp; pp; i++, pp = pp->next) {
		if ((pp->format == FORMAT_RAW) && (topo.num_packages == 0))
			average.packages.perf_counter[i] = p->perf_counter[i];
		else
			average.packages.perf_counter[i] += p->perf_counter[i];
	}

	for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) {
		average.packages.pmt_counter[i] += p->pmt_counter[i];
	}

	return 0;
}
3642 
3643 /*
3644  * sum the counters for all cpus in the system
3645  * compute the weighted average
3646  */
void compute_average(struct thread_data *t, struct core_data *c, struct pkg_data *p)
{
	int i;
	struct msr_counter *mp;
	struct perf_counter_info *pp;
	struct pmt_counter *ppmt;

	/* Zero the global "average" accumulator, then sum every allowed CPU into it. */
	clear_counters(&average.threads, &average.cores, &average.packages);

	for_all_cpus(sum_counters, t, c, p);

	/* Use the global time delta for the average. */
	average.threads.tv_delta = tv_delta;

	/* thread-scope sums -> per-CPU averages */
	average.threads.tsc /= topo.allowed_cpus;
	average.threads.aperf /= topo.allowed_cpus;
	average.threads.mperf /= topo.allowed_cpus;
	average.threads.instr_count /= topo.allowed_cpus;
	average.threads.c1 /= topo.allowed_cpus;

	/* widen output columns when the summed IRQ count exceeds 7 digits */
	if (average.threads.irq_count > 9999999)
		sums_need_wide_columns = 1;

	/* core-scope sums -> per-core averages */
	average.cores.c3 /= topo.allowed_cores;
	average.cores.c6 /= topo.allowed_cores;
	average.cores.c7 /= topo.allowed_cores;
	average.cores.mc6_us /= topo.allowed_cores;

	/* package-scope sums -> per-package averages (only for enabled columns) */
	if (DO_BIC(BIC_Totl_c0))
		average.packages.pkg_wtd_core_c0 /= topo.allowed_packages;
	if (DO_BIC(BIC_Any_c0))
		average.packages.pkg_any_core_c0 /= topo.allowed_packages;
	if (DO_BIC(BIC_GFX_c0))
		average.packages.pkg_any_gfxe_c0 /= topo.allowed_packages;
	if (DO_BIC(BIC_CPUGFX))
		average.packages.pkg_both_core_gfxe_c0 /= topo.allowed_packages;

	average.packages.pc2 /= topo.allowed_packages;
	if (DO_BIC(BIC_Pkgpc3))
		average.packages.pc3 /= topo.allowed_packages;
	if (DO_BIC(BIC_Pkgpc6))
		average.packages.pc6 /= topo.allowed_packages;
	if (DO_BIC(BIC_Pkgpc7))
		average.packages.pc7 /= topo.allowed_packages;

	average.packages.pc8 /= topo.allowed_packages;
	average.packages.pc9 /= topo.allowed_packages;
	average.packages.pc10 /= topo.allowed_packages;
	average.packages.die_c6 /= topo.allowed_packages;

	/* user-added counters: RAW-format values are never averaged */
	for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
		if (mp->format == FORMAT_RAW)
			continue;
		if (mp->type == COUNTER_ITEMS) {
			if (average.threads.counter[i] > 9999999)
				sums_need_wide_columns = 1;
			continue;
		}
		average.threads.counter[i] /= topo.allowed_cpus;
	}
	for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
		if (mp->format == FORMAT_RAW)
			continue;
		if (mp->type == COUNTER_ITEMS) {
			if (average.cores.counter[i] > 9999999)
				sums_need_wide_columns = 1;
		}
		/*
		 * NOTE(review): unlike the thread loop above there is no "continue"
		 * here, so COUNTER_ITEMS core counters are divided too -- confirm
		 * this asymmetry is intentional.
		 */
		average.cores.counter[i] /= topo.allowed_cores;
	}
	for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
		if (mp->format == FORMAT_RAW)
			continue;
		if (mp->type == COUNTER_ITEMS) {
			if (average.packages.counter[i] > 9999999)
				sums_need_wide_columns = 1;
		}
		average.packages.counter[i] /= topo.allowed_packages;
	}

	/* perf-sourced added counters follow the same RAW/ITEMS rules */
	for (i = 0, pp = sys.perf_tp; pp; i++, pp = pp->next) {
		if (pp->format == FORMAT_RAW)
			continue;
		if (pp->type == COUNTER_ITEMS) {
			if (average.threads.perf_counter[i] > 9999999)
				sums_need_wide_columns = 1;
			continue;
		}
		average.threads.perf_counter[i] /= topo.allowed_cpus;
	}
	for (i = 0, pp = sys.perf_cp; pp; i++, pp = pp->next) {
		if (pp->format == FORMAT_RAW)
			continue;
		if (pp->type == COUNTER_ITEMS) {
			if (average.cores.perf_counter[i] > 9999999)
				sums_need_wide_columns = 1;
		}
		average.cores.perf_counter[i] /= topo.allowed_cores;
	}
	for (i = 0, pp = sys.perf_pp; pp; i++, pp = pp->next) {
		if (pp->format == FORMAT_RAW)
			continue;
		if (pp->type == COUNTER_ITEMS) {
			if (average.packages.perf_counter[i] > 9999999)
				sums_need_wide_columns = 1;
		}
		average.packages.perf_counter[i] /= topo.allowed_packages;
	}

	/* PMT counters are always averaged, regardless of format */
	for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) {
		average.threads.pmt_counter[i] /= topo.allowed_cpus;
	}
	for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) {
		average.cores.pmt_counter[i] /= topo.allowed_cores;
	}
	for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) {
		average.packages.pmt_counter[i] /= topo.allowed_packages;
	}
}
3765 
3766 static unsigned long long rdtsc(void)
3767 {
3768         unsigned int low, high;
3769 
3770         asm volatile ("rdtsc":"=a" (low), "=d"(high));
3771 
3772         return low | ((unsigned long long)high) << 32;
3773 }
3774 
3775 /*
3776  * Open a file, and exit on failure
3777  */
FILE *fopen_or_die(const char *path, const char *mode)
{
	FILE *stream = fopen(path, mode);

	/* treat a missing/unopenable file as fatal */
	if (stream == NULL)
		err(1, "%s: open failed", path);

	return stream;
}
3786 
3787 /*
3788  * snapshot_sysfs_counter()
3789  *
3790  * return snapshot of given counter
3791  */
/*
 * Read a single unsigned decimal counter from the sysfs file at "path".
 * Exits via err() if the file cannot be opened or parsed.
 */
unsigned long long snapshot_sysfs_counter(char *path)
{
	FILE *fp;
	int retval;
	unsigned long long counter;

	fp = fopen_or_die(path, "r");

	/* %llu matches the unsigned long long destination; %lld did not */
	retval = fscanf(fp, "%llu", &counter);
	if (retval != 1)
		err(1, "snapshot_sysfs_counter(%s)", path);

	fclose(fp);

	return counter;
}
3808 
3809 int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp, char *counter_path)
3810 {
3811         if (mp->msr_num != 0) {
3812                 assert(!no_msr);
3813                 if (get_msr(cpu, mp->msr_num, counterp))
3814                         return -1;
3815         } else {
3816                 char path[128 + PATH_BYTES];
3817 
3818                 if (mp->flags & SYSFS_PERCPU) {
3819                         sprintf(path, "/sys/devices/system/cpu/cpu%d/%s", cpu, mp->sp->path);
3820 
3821                         *counterp = snapshot_sysfs_counter(path);
3822                 } else {
3823                         *counterp = snapshot_sysfs_counter(counter_path);
3824                 }
3825         }
3826 
3827         return 0;
3828 }
3829 
3830 unsigned long long get_legacy_uncore_mhz(int package)
3831 {
3832         char path[128];
3833         int die;
3834         static int warn_once;
3835 
3836         /*
3837          * for this package, use the first die_id that exists
3838          */
3839         for (die = 0; die <= topo.max_die_id; ++die) {
3840 
3841                 sprintf(path, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d/current_freq_khz",
3842                         package, die);
3843 
3844                 if (access(path, R_OK) == 0)
3845                         return (snapshot_sysfs_counter(path) / 1000);
3846         }
3847         if (!warn_once) {
3848                 warnx("BUG: %s: No %s", __func__, path);
3849                 warn_once = 1;
3850         }
3851 
3852         return 0;
3853 }
3854 
3855 int get_epb(int cpu)
3856 {
3857         char path[128 + PATH_BYTES];
3858         unsigned long long msr;
3859         int ret, epb = -1;
3860         FILE *fp;
3861 
3862         sprintf(path, "/sys/devices/system/cpu/cpu%d/power/energy_perf_bias", cpu);
3863 
3864         fp = fopen(path, "r");
3865         if (!fp)
3866                 goto msr_fallback;
3867 
3868         ret = fscanf(fp, "%d", &epb);
3869         if (ret != 1)
3870                 err(1, "%s(%s)", __func__, path);
3871 
3872         fclose(fp);
3873 
3874         return epb;
3875 
3876 msr_fallback:
3877         if (no_msr)
3878                 return -1;
3879 
3880         get_msr(cpu, MSR_IA32_ENERGY_PERF_BIAS, &msr);
3881 
3882         return msr & 0xf;
3883 }
3884 
/* Record this thread's APIC id and/or x2APIC id (per enabled columns) in *t. */
void get_apic_id(struct thread_data *t)
{
	unsigned int eax, ebx, ecx, edx;

	if (DO_BIC(BIC_APIC)) {
		eax = ebx = ecx = edx = 0;
		__cpuid(1, eax, ebx, ecx, edx);

		/* initial APIC id is in EBX[31:24] of cpuid leaf 1 */
		t->apic_id = (ebx >> 24) & 0xff;
	}

	if (!DO_BIC(BIC_X2APIC))
		return;

	if (authentic_amd || hygon_genuine) {
		unsigned int topology_extensions;

		/* extended topology leaf 0x8000001e must be available */
		if (max_extended_level < 0x8000001e)
			return;

		eax = ebx = ecx = edx = 0;
		__cpuid(0x80000001, eax, ebx, ecx, edx);
		/* ECX bit 22 of leaf 0x80000001 advertises topology extensions */
		topology_extensions = ecx & (1 << 22);

		if (topology_extensions == 0)
			return;

		eax = ebx = ecx = edx = 0;
		__cpuid(0x8000001e, eax, ebx, ecx, edx);

		/* EAX of leaf 0x8000001e carries the extended APIC id */
		t->x2apic_id = eax;
		return;
	}

	if (!genuine_intel)
		return;

	/* Intel: x2APIC id comes from cpuid leaf 0xb (EDX) */
	if (max_level < 0xb)
		return;

	ecx = 0;
	__cpuid(0xb, eax, ebx, ecx, edx);
	t->x2apic_id = edx;

	/* the low byte of the x2APIC id should match the legacy APIC id */
	if (debug && (t->apic_id != (t->x2apic_id & 0xff)))
		fprintf(outf, "cpu%d: BIOS BUG: apic 0x%x x2apic 0x%x\n", t->cpu_id, t->apic_id, t->x2apic_id);
}
3932 
3933 int get_core_throt_cnt(int cpu, unsigned long long *cnt)
3934 {
3935         char path[128 + PATH_BYTES];
3936         unsigned long long tmp;
3937         FILE *fp;
3938         int ret;
3939 
3940         sprintf(path, "/sys/devices/system/cpu/cpu%d/thermal_throttle/core_throttle_count", cpu);
3941         fp = fopen(path, "r");
3942         if (!fp)
3943                 return -1;
3944         ret = fscanf(fp, "%lld", &tmp);
3945         fclose(fp);
3946         if (ret != 1)
3947                 return -1;
3948         *cnt = tmp;
3949 
3950         return 0;
3951 }
3952 
/*
 * Perf event file descriptors for an APERF/MPERF read group.
 * NOTE(review): not referenced in the visible portion of this file --
 * verify whether it is still used before removing.
 */
struct amperf_group_fd {
	int aperf;		/* Also the group descriptor */
	int mperf;
};
3957 
3958 static int read_perf_counter_info(const char *const path, const char *const parse_format, void *value_ptr)
3959 {
3960         int fdmt;
3961         int bytes_read;
3962         char buf[64];
3963         int ret = -1;
3964 
3965         fdmt = open(path, O_RDONLY, 0);
3966         if (fdmt == -1) {
3967                 if (debug)
3968                         fprintf(stderr, "Failed to parse perf counter info %s\n", path);
3969                 ret = -1;
3970                 goto cleanup_and_exit;
3971         }
3972 
3973         bytes_read = read(fdmt, buf, sizeof(buf) - 1);
3974         if (bytes_read <= 0 || bytes_read >= (int)sizeof(buf)) {
3975                 if (debug)
3976                         fprintf(stderr, "Failed to parse perf counter info %s\n", path);
3977                 ret = -1;
3978                 goto cleanup_and_exit;
3979         }
3980 
3981         buf[bytes_read] = '\0';
3982 
3983         if (sscanf(buf, parse_format, value_ptr) != 1) {
3984                 if (debug)
3985                         fprintf(stderr, "Failed to parse perf counter info %s\n", path);
3986                 ret = -1;
3987                 goto cleanup_and_exit;
3988         }
3989 
3990         ret = 0;
3991 
3992 cleanup_and_exit:
3993         close(fdmt);
3994         return ret;
3995 }
3996 
/*
 * Like read_perf_counter_info(), but returns the parsed unsigned value
 * directly, or (unsigned int)-1 on failure.
 */
static unsigned int read_perf_counter_info_n(const char *const path, const char *const parse_format)
{
	unsigned int value = 0;

	if (read_perf_counter_info(path, parse_format, &value))
		return (unsigned int)-1;

	return value;
}
4008 
/* Return the perf PMU type id for "subsys", or (unsigned int)-1 on failure. */
static unsigned int read_perf_type(const char *subsys)
{
	char path[128];

	/* the PMU type id lives in /sys/bus/event_source/devices/<subsys>/type */
	snprintf(path, sizeof(path), "/sys/bus/event_source/devices/%s/type", subsys);

	return read_perf_counter_info_n(path, "%u");
}
4019 
/*
 * Parse the "event=<hex>[,umask=<hex>...]" description of a perf event
 * from sysfs and return the combined config value (umask<<8 | event),
 * or (unsigned int)-1 on any failure.
 */
static unsigned int read_perf_config(const char *subsys, const char *event_name)
{
	char path[128];
	char config_str[64];
	unsigned int config = 0;
	unsigned int umask = 0;
	bool has_config = false;
	bool has_umask = false;
	unsigned int ret = -1;
	FILE *fconfig;

	snprintf(path, sizeof(path), "/sys/bus/event_source/devices/%s/events/%s", subsys, event_name);

	fconfig = fopen(path, "r");
	if (!fconfig)
		return -1;

	if (fgets(config_str, sizeof(config_str), fconfig) == config_str) {
		char *field = config_str;

		/* walk the comma-separated fields, one at a time */
		while (field) {
			char *next = strchr(field, ',');

			if (next)
				*next++ = '\0';

			if (sscanf(field, "event=%x", &config) == 1)
				has_config = true;
			else if (sscanf(field, "umask=%x", &umask) == 1)
				has_umask = true;

			field = next;
		}

		if (!has_umask)
			umask = 0;

		if (has_config)
			ret = (umask << 8) | config;
	}

	fclose(fconfig);
	return ret;
}
4070 
4071 static unsigned int read_perf_rapl_unit(const char *subsys, const char *event_name)
4072 {
4073         const char *const path_format = "/sys/bus/event_source/devices/%s/events/%s.unit";
4074         const char *const format = "%s";
4075         char path[128];
4076         char unit_buffer[16];
4077 
4078         snprintf(path, sizeof(path), path_format, subsys, event_name);
4079 
4080         read_perf_counter_info(path, format, &unit_buffer);
4081         if (strcmp("Joules", unit_buffer) == 0)
4082                 return RAPL_UNIT_JOULES;
4083 
4084         return RAPL_UNIT_INVALID;
4085 }
4086 
/* Return the perf event's sysfs ".scale" factor, or 0.0 on failure. */
static double read_perf_scale(const char *subsys, const char *event_name)
{
	char path[128];
	double scale;

	snprintf(path, sizeof(path), "/sys/bus/event_source/devices/%s/events/%s.scale", subsys, event_name);

	/* a missing or unreadable scale file is reported as 0.0 */
	return read_perf_counter_info(path, "%lf", &scale) ? 0.0 : scale;
}
4101 
4102 size_t rapl_counter_info_count_perf(const struct rapl_counter_info_t *rci)
4103 {
4104         size_t ret = 0;
4105 
4106         for (int i = 0; i < NUM_RAPL_COUNTERS; ++i)
4107                 if (rci->source[i] == COUNTER_SOURCE_PERF)
4108                         ++ret;
4109 
4110         return ret;
4111 }
4112 
4113 static size_t cstate_counter_info_count_perf(const struct cstate_counter_info_t *cci)
4114 {
4115         size_t ret = 0;
4116 
4117         for (int i = 0; i < NUM_CSTATE_COUNTERS; ++i)
4118                 if (cci->source[i] == COUNTER_SOURCE_PERF)
4119                         ++ret;
4120 
4121         return ret;
4122 }
4123 
/* Publish one RAPL reading (raw value, unit, scale) from rci slot "idx" into *rc. */
void write_rapl_counter(struct rapl_counter *rc, struct rapl_counter_info_t *rci, unsigned int idx)
{
	rc->raw_value = rci->data[idx];
	rc->unit = rci->unit[idx];
	rc->scale = rci->scale[idx];
}
4130 
/*
 * Read all RAPL counters for the given domain -- one bulk perf read first,
 * then per-counter MSR reads -- and publish the results into *c and *p.
 * Returns 0 on success, or a negative code identifying the failed counter.
 */
int get_rapl_counters(int cpu, unsigned int domain, struct core_data *c, struct pkg_data *p)
{
	/* +1: a perf group read is prefixed with the number of counters read */
	unsigned long long perf_data[NUM_RAPL_COUNTERS + 1];
	struct rapl_counter_info_t *rci;

	if (debug >= 2)
		fprintf(stderr, "%s: cpu%d domain%d\n", __func__, cpu, domain);

	assert(rapl_counter_info_perdomain);
	assert(domain < rapl_counter_info_perdomain_size);

	rci = &rapl_counter_info_perdomain[domain];

	/*
	 * If we have any perf counters to read, read them all now, in bulk
	 */
	if (rci->fd_perf != -1) {
		size_t num_perf_counters = rapl_counter_info_count_perf(rci);
		const ssize_t expected_read_size = (num_perf_counters + 1) * sizeof(unsigned long long);
		const ssize_t actual_read_size = read(rci->fd_perf, &perf_data[0], sizeof(perf_data));

		/* NOTE(review): both sizes are ssize_t; %zu prints them as unsigned */
		if (actual_read_size != expected_read_size)
			err(-1, "%s: failed to read perf_data (%zu %zu)", __func__, expected_read_size,
			    actual_read_size);
	}

	/* pi starts at 1: perf_data[0] holds the group's counter count */
	for (unsigned int i = 0, pi = 1; i < NUM_RAPL_COUNTERS; ++i) {
		switch (rci->source[i]) {
		case COUNTER_SOURCE_NONE:
			break;

		case COUNTER_SOURCE_PERF:
			assert(pi < ARRAY_SIZE(perf_data));
			assert(rci->fd_perf != -1);

			if (debug >= 2)
				fprintf(stderr, "Reading rapl counter via perf at %u (%llu %e %lf)\n",
					i, perf_data[pi], rci->scale[i], perf_data[pi] * rci->scale[i]);

			rci->data[i] = perf_data[pi];

			++pi;
			break;

		case COUNTER_SOURCE_MSR:
			if (debug >= 2)
				fprintf(stderr, "Reading rapl counter via msr at %u\n", i);

			assert(!no_msr);
			if (rci->flags[i] & RAPL_COUNTER_FLAG_USE_MSR_SUM) {
				/* presumably the accumulated sum handles counter wrap -- see get_msr_sum() */
				if (get_msr_sum(cpu, rci->msr[i], &rci->data[i]))
					return -13 - i;
			} else {
				if (get_msr(cpu, rci->msr[i], &rci->data[i]))
					return -13 - i;
			}

			/* isolate the relevant field of the raw MSR value */
			rci->data[i] &= rci->msr_mask[i];
			if (rci->msr_shift[i] >= 0)
				rci->data[i] >>= abs(rci->msr_shift[i]);
			else
				rci->data[i] <<= abs(rci->msr_shift[i]);

			break;
		}
	}

	BUILD_BUG_ON(NUM_RAPL_COUNTERS != 7);
	/* copy the freshly read values out to the per-core/per-package records */
	write_rapl_counter(&p->energy_pkg, rci, RAPL_RCI_INDEX_ENERGY_PKG);
	write_rapl_counter(&p->energy_cores, rci, RAPL_RCI_INDEX_ENERGY_CORES);
	write_rapl_counter(&p->energy_dram, rci, RAPL_RCI_INDEX_DRAM);
	write_rapl_counter(&p->energy_gfx, rci, RAPL_RCI_INDEX_GFX);
	write_rapl_counter(&p->rapl_pkg_perf_status, rci, RAPL_RCI_INDEX_PKG_PERF_STATUS);
	write_rapl_counter(&p->rapl_dram_perf_status, rci, RAPL_RCI_INDEX_DRAM_PERF_STATUS);
	write_rapl_counter(&c->core_energy, rci, RAPL_RCI_INDEX_CORE_ENERGY);

	return 0;
}
4209 
4210 char *find_sysfs_path_by_id(struct sysfs_path *sp, int id)
4211 {
4212         while (sp) {
4213                 if (sp->id == id)
4214                         return (sp->path);
4215                 sp = sp->next;
4216         }
4217         if (debug)
4218                 warnx("%s: id%d not found", __func__, id);
4219         return NULL;
4220 }
4221 
/*
 * Read all core/package C-state residency counters for "cpu" -- bulk perf
 * reads first, MSR fallbacks second -- and publish them into *t, *c and *p.
 * Returns 0 on success, or a negative code identifying the failed counter.
 */
int get_cstate_counters(unsigned int cpu, struct thread_data *t, struct core_data *c, struct pkg_data *p)
{
	/*
	 * Overcommit memory a little bit here,
	 * but skip calculating exact sizes for the buffers.
	 */
	unsigned long long perf_data[NUM_CSTATE_COUNTERS];
	unsigned long long perf_data_core[NUM_CSTATE_COUNTERS + 1];
	unsigned long long perf_data_pkg[NUM_CSTATE_COUNTERS + 1];

	struct cstate_counter_info_t *cci;

	if (debug >= 2)
		fprintf(stderr, "%s: cpu%d\n", __func__, cpu);

	assert(ccstate_counter_info);
	/* NOTE(review): "<=" permits cpu == size; verify the array has size+1 entries (looks like an off-by-one) */
	assert(cpu <= ccstate_counter_info_size);

	ZERO_ARRAY(perf_data);
	ZERO_ARRAY(perf_data_core);
	ZERO_ARRAY(perf_data_pkg);

	cci = &ccstate_counter_info[cpu];

	/*
	 * If we have any perf counters to read, read them all now, in bulk
	 */
	const size_t num_perf_counters = cstate_counter_info_count_perf(cci);
	ssize_t expected_read_size = num_perf_counters * sizeof(unsigned long long);
	ssize_t actual_read_size_core = 0, actual_read_size_pkg = 0;

	if (cci->fd_perf_core != -1) {
		/* Each descriptor read begins with number of counters read. */
		expected_read_size += sizeof(unsigned long long);

		actual_read_size_core = read(cci->fd_perf_core, &perf_data_core[0], sizeof(perf_data_core));

		if (actual_read_size_core <= 0)
			err(-1, "%s: read perf %s: %ld", __func__, "core", actual_read_size_core);
	}

	if (cci->fd_perf_pkg != -1) {
		/* Each descriptor read begins with number of counters read. */
		expected_read_size += sizeof(unsigned long long);

		actual_read_size_pkg = read(cci->fd_perf_pkg, &perf_data_pkg[0], sizeof(perf_data_pkg));

		if (actual_read_size_pkg <= 0)
			err(-1, "%s: read perf %s: %ld", __func__, "pkg", actual_read_size_pkg);
	}

	const ssize_t actual_read_size_total = actual_read_size_core + actual_read_size_pkg;

	if (actual_read_size_total != expected_read_size)
		err(-1, "%s: failed to read perf_data (%zu %zu)", __func__, expected_read_size, actual_read_size_total);

	/*
	 * Copy ccstate and pcstate data into unified buffer.
	 *
	 * Skip first element from core and pkg buffers.
	 * Kernel puts there how many counters were read.
	 */
	const size_t num_core_counters = perf_data_core[0];
	const size_t num_pkg_counters = perf_data_pkg[0];

	assert(num_perf_counters == num_core_counters + num_pkg_counters);

	/* Copy ccstate perf data */
	memcpy(&perf_data[0], &perf_data_core[1], num_core_counters * sizeof(unsigned long long));

	/* Copy pcstate perf data */
	memcpy(&perf_data[num_core_counters], &perf_data_pkg[1], num_pkg_counters * sizeof(unsigned long long));

	/* pi walks the unified perf buffer; only PERF-sourced slots consume it */
	for (unsigned int i = 0, pi = 0; i < NUM_CSTATE_COUNTERS; ++i) {
		switch (cci->source[i]) {
		case COUNTER_SOURCE_NONE:
			break;

		case COUNTER_SOURCE_PERF:
			assert(pi < ARRAY_SIZE(perf_data));
			assert(cci->fd_perf_core != -1 || cci->fd_perf_pkg != -1);

			if (debug >= 2)
				fprintf(stderr, "cstate via %s %u: %llu\n", "perf", i, perf_data[pi]);

			cci->data[i] = perf_data[pi];

			++pi;
			break;

		case COUNTER_SOURCE_MSR:
			assert(!no_msr);
			if (get_msr(cpu, cci->msr[i], &cci->data[i]))
				return -13 - i;

			if (debug >= 2)
				fprintf(stderr, "cstate via %s0x%llx %u: %llu\n", "msr", cci->msr[i], i, cci->data[i]);

			break;
		}
	}

	/*
	 * Helper to write the data only if the source of
	 * the counter for the current cpu is not none.
	 *
	 * Otherwise we would overwrite core data with 0 (default value),
	 * when invoked for the thread sibling.
	 */
#define PERF_COUNTER_WRITE_DATA(out_counter, index) do {	\
	if (cci->source[index] != COUNTER_SOURCE_NONE)		\
		out_counter = cci->data[index];			\
} while (0)

	BUILD_BUG_ON(NUM_CSTATE_COUNTERS != 11);

	PERF_COUNTER_WRITE_DATA(t->c1, CCSTATE_RCI_INDEX_C1_RESIDENCY);
	PERF_COUNTER_WRITE_DATA(c->c3, CCSTATE_RCI_INDEX_C3_RESIDENCY);
	PERF_COUNTER_WRITE_DATA(c->c6, CCSTATE_RCI_INDEX_C6_RESIDENCY);
	PERF_COUNTER_WRITE_DATA(c->c7, CCSTATE_RCI_INDEX_C7_RESIDENCY);

	PERF_COUNTER_WRITE_DATA(p->pc2, PCSTATE_RCI_INDEX_C2_RESIDENCY);
	PERF_COUNTER_WRITE_DATA(p->pc3, PCSTATE_RCI_INDEX_C3_RESIDENCY);
	PERF_COUNTER_WRITE_DATA(p->pc6, PCSTATE_RCI_INDEX_C6_RESIDENCY);
	PERF_COUNTER_WRITE_DATA(p->pc7, PCSTATE_RCI_INDEX_C7_RESIDENCY);
	PERF_COUNTER_WRITE_DATA(p->pc8, PCSTATE_RCI_INDEX_C8_RESIDENCY);
	PERF_COUNTER_WRITE_DATA(p->pc9, PCSTATE_RCI_INDEX_C9_RESIDENCY);
	PERF_COUNTER_WRITE_DATA(p->pc10, PCSTATE_RCI_INDEX_C10_RESIDENCY);

#undef PERF_COUNTER_WRITE_DATA

	return 0;
}
4355 
4356 size_t msr_counter_info_count_perf(const struct msr_counter_info_t *mci)
4357 {
4358         size_t ret = 0;
4359 
4360         for (int i = 0; i < NUM_MSR_COUNTERS; ++i)
4361                 if (mci->source[i] == COUNTER_SOURCE_PERF)
4362                         ++ret;
4363 
4364         return ret;
4365 }
4366 
/*
 * Read APERF, MPERF and the SMI count for "cpu" -- one bulk perf read
 * first, MSR fallbacks second -- and store the results in *t.
 * Returns 0 on success, or a negative code identifying the failed counter.
 */
int get_smi_aperf_mperf(unsigned int cpu, struct thread_data *t)
{
	/* +1: a perf group read is prefixed with the number of counters read */
	unsigned long long perf_data[NUM_MSR_COUNTERS + 1];

	struct msr_counter_info_t *mci;

	if (debug >= 2)
		fprintf(stderr, "%s: cpu%d\n", __func__, cpu);

	assert(msr_counter_info);
	/* NOTE(review): "<=" permits cpu == size; verify the array has size+1 entries (looks like an off-by-one) */
	assert(cpu <= msr_counter_info_size);

	mci = &msr_counter_info[cpu];

	ZERO_ARRAY(perf_data);
	ZERO_ARRAY(mci->data);

	/* if any counters come from perf, read the whole group in one syscall */
	if (mci->fd_perf != -1) {
		const size_t num_perf_counters = msr_counter_info_count_perf(mci);
		const ssize_t expected_read_size = (num_perf_counters + 1) * sizeof(unsigned long long);
		const ssize_t actual_read_size = read(mci->fd_perf, &perf_data[0], sizeof(perf_data));

		if (actual_read_size != expected_read_size)
			err(-1, "%s: failed to read perf_data (%zu %zu)", __func__, expected_read_size,
			    actual_read_size);
	}

	/* pi starts at 1: perf_data[0] holds the group's counter count */
	for (unsigned int i = 0, pi = 1; i < NUM_MSR_COUNTERS; ++i) {
		switch (mci->source[i]) {
		case COUNTER_SOURCE_NONE:
			break;

		case COUNTER_SOURCE_PERF:
			assert(pi < ARRAY_SIZE(perf_data));
			assert(mci->fd_perf != -1);

			if (debug >= 2)
				fprintf(stderr, "Reading msr counter via perf at %u: %llu\n", i, perf_data[pi]);

			mci->data[i] = perf_data[pi];

			++pi;
			break;

		case COUNTER_SOURCE_MSR:
			assert(!no_msr);

			if (get_msr(cpu, mci->msr[i], &mci->data[i]))
				return -2 - i;

			mci->data[i] &= mci->msr_mask[i];

			if (debug >= 2)
				fprintf(stderr, "Reading msr counter via msr at %u: %llu\n", i, mci->data[i]);

			break;
		}
	}

	BUILD_BUG_ON(NUM_MSR_COUNTERS != 3);
	/* publish the three fixed slots into the thread record */
	t->aperf = mci->data[MSR_RCI_INDEX_APERF];
	t->mperf = mci->data[MSR_RCI_INDEX_MPERF];
	t->smi_count = mci->data[MSR_RCI_INDEX_SMI];

	return 0;
}
4433 
4434 int perf_counter_info_read_values(struct perf_counter_info *pp, int cpu, unsigned long long *out, size_t out_size)
4435 {
4436         unsigned int domain;
4437         unsigned long long value;
4438         int fd_counter;
4439 
4440         for (size_t i = 0; pp; ++i, pp = pp->next) {
4441                 domain = cpu_to_domain(pp, cpu);
4442                 assert(domain < pp->num_domains);
4443 
4444                 fd_counter = pp->fd_perf_per_domain[domain];
4445 
4446                 if (fd_counter == -1)
4447                         continue;
4448 
4449                 if (read(fd_counter, &value, sizeof(value)) != sizeof(value))
4450                         return 1;
4451 
4452                 assert(i < out_size);
4453                 out[i] = value * pp->scale;
4454         }
4455 
4456         return 0;
4457 }
4458 
/*
 * pmt_gen_value_mask()
 *
 * Return a mask with bits [lsb, msb] set (inclusive, 0-based).
 * Shifts use 1UL: the previous "1 << (msb + 1)" was an int-width shift,
 * undefined/wrong for msb >= 31 (and "1 << lsb" for lsb >= 31).
 */
unsigned long pmt_gen_value_mask(unsigned int lsb, unsigned int msb)
{
	unsigned long mask;

	if (msb >= 63)
		mask = 0xffffffffffffffff;
	else
		mask = (1UL << (msb + 1)) - 1;

	/* clear the bits below lsb */
	mask &= ~((1UL << lsb) - 1);

	return mask;
}
4472 
4473 unsigned long pmt_read_counter(struct pmt_counter *ppmt, unsigned int domain_id)
4474 {
4475         assert(domain_id < ppmt->num_domains);
4476 
4477         const unsigned long *pmmio = ppmt->domains[domain_id].pcounter;
4478         const unsigned long value = pmmio ? *pmmio : 0;
4479         const unsigned long value_mask = pmt_gen_value_mask(ppmt->lsb, ppmt->msb);
4480         const unsigned long value_shift = ppmt->lsb;
4481 
4482         return (value & value_mask) >> value_shift;
4483 }
4484 
4485 /*
4486  * get_counters(...)
4487  * migrate to cpu
4488  * acquire and record local counters for that cpu
4489  */
4490 int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
4491 {
4492         int cpu = t->cpu_id;
4493         unsigned long long msr;
4494         struct msr_counter *mp;
4495         struct pmt_counter *pp;
4496         int i;
4497         int status;
4498 
4499         if (cpu_migrate(cpu)) {
4500                 fprintf(outf, "%s: Could not migrate to CPU %d\n", __func__, cpu);
4501                 return -1;
4502         }
4503 
4504         gettimeofday(&t->tv_begin, (struct timezone *)NULL);
4505 
4506         if (first_counter_read)
4507                 get_apic_id(t);
4508 
4509         t->tsc = rdtsc();       /* we are running on local CPU of interest */
4510 
4511         get_smi_aperf_mperf(cpu, t);
4512 
4513         if (DO_BIC(BIC_IPC))
4514                 if (read(get_instr_count_fd(cpu), &t->instr_count, sizeof(long long)) != sizeof(long long))
4515                         return -4;
4516 
4517         if (DO_BIC(BIC_IRQ))
4518                 t->irq_count = irqs_per_cpu[cpu];
4519 
4520         get_cstate_counters(cpu, t, c, p);
4521 
4522         for (i = 0, mp = sys.tp; mp; i++, mp = mp->next) {
4523                 if (get_mp(cpu, mp, &t->counter[i], mp->sp->path))
4524                         return -10;
4525         }
4526 
4527         if (perf_counter_info_read_values(sys.perf_tp, cpu, t->perf_counter, MAX_ADDED_THREAD_COUNTERS))
4528                 return -10;
4529 
4530         for (i = 0, pp = sys.pmt_tp; pp; i++, pp = pp->next)
4531                 t->pmt_counter[i] = pmt_read_counter(pp, t->cpu_id);
4532 
4533         /* collect core counters only for 1st thread in core */
4534         if (!is_cpu_first_thread_in_core(t, c, p))
4535                 goto done;
4536 
4537         if (platform->has_per_core_rapl) {
4538                 status = get_rapl_counters(cpu, c->core_id, c, p);
4539                 if (status != 0)
4540                         return status;
4541         }
4542 
4543         if (DO_BIC(BIC_CPU_c7) && t->is_atom) {
4544                 /*
4545                  * For Atom CPUs that has core cstate deeper than c6,
4546                  * MSR_CORE_C6_RESIDENCY returns residency of cc6 and deeper.
4547                  * Minus CC7 (and deeper cstates) residency to get
4548                  * accturate cc6 residency.
4549                  */
4550                 c->c6 -= c->c7;
4551         }
4552 
4553         if (DO_BIC(BIC_Mod_c6))
4554                 if (get_msr(cpu, MSR_MODULE_C6_RES_MS, &c->mc6_us))
4555                         return -8;
4556 
4557         if (DO_BIC(BIC_CoreTmp)) {
4558                 if (get_msr(cpu, MSR_IA32_THERM_STATUS, &msr))
4559                         return -9;
4560                 c->core_temp_c = tj_max - ((msr >> 16) & 0x7F);
4561         }
4562 
4563         if (DO_BIC(BIC_CORE_THROT_CNT))
4564                 get_core_throt_cnt(cpu, &c->core_throt_cnt);
4565 
4566         for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) {
4567                 if (get_mp(cpu, mp, &c->counter[i], mp->sp->path))
4568                         return -10;
4569         }
4570 
4571         if (perf_counter_info_read_values(sys.perf_cp, cpu, c->perf_counter, MAX_ADDED_CORE_COUNTERS))
4572                 return -10;
4573 
4574         for (i = 0, pp = sys.pmt_cp; pp; i++, pp = pp->next)
4575                 c->pmt_counter[i] = pmt_read_counter(pp, c->core_id);
4576 
4577         /* collect package counters only for 1st core in package */
4578         if (!is_cpu_first_core_in_package(t, c, p))
4579                 goto done;
4580 
4581         if (DO_BIC(BIC_Totl_c0)) {
4582                 if (get_msr(cpu, MSR_PKG_WEIGHTED_CORE_C0_RES, &p->pkg_wtd_core_c0))
4583                         return -10;
4584         }
4585         if (DO_BIC(BIC_Any_c0)) {
4586                 if (get_msr(cpu, MSR_PKG_ANY_CORE_C0_RES, &p->pkg_any_core_c0))
4587                         return -11;
4588         }
4589         if (DO_BIC(BIC_GFX_c0)) {
4590                 if (get_msr(cpu, MSR_PKG_ANY_GFXE_C0_RES, &p->pkg_any_gfxe_c0))
4591                         return -12;
4592         }
4593         if (DO_BIC(BIC_CPUGFX)) {
4594                 if (get_msr(cpu, MSR_PKG_BOTH_CORE_GFXE_C0_RES, &p->pkg_both_core_gfxe_c0))
4595                         return -13;
4596         }
4597 
4598         if (DO_BIC(BIC_CPU_LPI))
4599                 p->cpu_lpi = cpuidle_cur_cpu_lpi_us;
4600         if (DO_BIC(BIC_SYS_LPI))
4601                 p->sys_lpi = cpuidle_cur_sys_lpi_us;
4602 
4603         if (!platform->has_per_core_rapl) {
4604                 status = get_rapl_counters(cpu, p->package_id, c, p);
4605                 if (status != 0)
4606                         return status;
4607         }
4608 
4609         if (DO_BIC(BIC_PkgTmp)) {
4610                 if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr))
4611                         return -17;
4612                 p->pkg_temp_c = tj_max - ((msr >> 16) & 0x7F);
4613         }
4614 
4615         if (DO_BIC(BIC_UNCORE_MHZ))
4616                 p->uncore_mhz = get_legacy_uncore_mhz(p->package_id);
4617 
4618         if (DO_BIC(BIC_GFX_rc6))
4619                 p->gfx_rc6_ms = gfx_info[GFX_rc6].val_ull;
4620 
4621         if (DO_BIC(BIC_GFXMHz))
4622                 p->gfx_mhz = gfx_info[GFX_MHz].val;
4623 
4624         if (DO_BIC(BIC_GFXACTMHz))
4625                 p->gfx_act_mhz = gfx_info[GFX_ACTMHz].val;
4626 
4627         if (DO_BIC(BIC_SAM_mc6))
4628                 p->sam_mc6_ms = gfx_info[SAM_mc6].val_ull;
4629 
4630         if (DO_BIC(BIC_SAMMHz))
4631                 p->sam_mhz = gfx_info[SAM_MHz].val;
4632 
4633         if (DO_BIC(BIC_SAMACTMHz))
4634                 p->sam_act_mhz = gfx_info[SAM_ACTMHz].val;
4635 
4636         for (i = 0, mp = sys.pp; mp; i++, mp = mp->next) {
4637                 char *path = NULL;
4638 
4639                 if (mp->msr_num == 0) {
4640                         path = find_sysfs_path_by_id(mp->sp, p->package_id);
4641                         if (path == NULL) {
4642                                 warnx("%s: package_id %d not found", __func__, p->package_id);
4643                                 return -10;
4644                         }
4645                 }
4646                 if (get_mp(cpu, mp, &p->counter[i], path))
4647                         return -10;
4648         }
4649 
4650         if (perf_counter_info_read_values(sys.perf_pp, cpu, p->perf_counter, MAX_ADDED_PACKAGE_COUNTERS))
4651                 return -10;
4652 
4653         for (i = 0, pp = sys.pmt_pp; pp; i++, pp = pp->next)
4654                 p->pmt_counter[i] = pmt_read_counter(pp, p->package_id);
4655 
4656 done:
4657         gettimeofday(&t->tv_end, (struct timezone *)NULL);
4658 
4659         return 0;
4660 }
4661 
/* Decoded package C-state limit; indexes pkg_cstate_limit_strings[] below. */
int pkg_cstate_limit = PCLUKN;
/* Human-readable names for the PCL* package C-state limit codes. */
char *pkg_cstate_limit_strings[] = { "unknown", "reserved", "pc0", "pc1", "pc2",
	"pc3", "pc4", "pc6", "pc6n", "pc6r", "pc7", "pc7s", "pc8", "pc9", "pc10", "unlimited"
};
4666 
/*
 * Per-platform-family decode tables for MSR_PKG_CST_CONFIG_CONTROL[3:0]:
 * each maps the 4-bit limit field to a PCL* code (see probe_cst_limit()).
 */
int nhm_pkg_cstate_limits[16] =
    { PCL__0, PCL__1, PCL__3, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
	PCLRSV, PCLRSV
};

int snb_pkg_cstate_limits[16] =
    { PCL__0, PCL__2, PCL_6N, PCL_6R, PCL__7, PCL_7S, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
	PCLRSV, PCLRSV
};

int hsw_pkg_cstate_limits[16] =
    { PCL__0, PCL__2, PCL__3, PCL__6, PCL__7, PCL_7S, PCL__8, PCL__9, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
	PCLRSV, PCLRSV
};

int slv_pkg_cstate_limits[16] =
    { PCL__0, PCL__1, PCLRSV, PCLRSV, PCL__4, PCLRSV, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
	PCL__6, PCL__7
};

int amt_pkg_cstate_limits[16] =
    { PCLUNL, PCL__1, PCL__2, PCLRSV, PCLRSV, PCLRSV, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
	PCLRSV, PCLRSV
};

int phi_pkg_cstate_limits[16] =
    { PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
	PCLRSV, PCLRSV
};

int glm_pkg_cstate_limits[16] =
    { PCLUNL, PCL__1, PCL__3, PCL__6, PCL__7, PCL_7S, PCL__8, PCL__9, PCL_10, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
	PCLRSV, PCLRSV
};

int skx_pkg_cstate_limits[16] =
    { PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
	PCLRSV, PCLRSV
};

int icx_pkg_cstate_limits[16] =
    { PCL__0, PCL__2, PCL__6, PCL__6, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV,
	PCLRSV, PCLRSV
};
4711 
4712 void probe_cst_limit(void)
4713 {
4714         unsigned long long msr;
4715         int *pkg_cstate_limits;
4716 
4717         if (!platform->has_nhm_msrs || no_msr)
4718                 return;
4719 
4720         switch (platform->cst_limit) {
4721         case CST_LIMIT_NHM:
4722                 pkg_cstate_limits = nhm_pkg_cstate_limits;
4723                 break;
4724         case CST_LIMIT_SNB:
4725                 pkg_cstate_limits = snb_pkg_cstate_limits;
4726                 break;
4727         case CST_LIMIT_HSW:
4728                 pkg_cstate_limits = hsw_pkg_cstate_limits;
4729                 break;
4730         case CST_LIMIT_SKX:
4731                 pkg_cstate_limits = skx_pkg_cstate_limits;
4732                 break;
4733         case CST_LIMIT_ICX:
4734                 pkg_cstate_limits = icx_pkg_cstate_limits;
4735                 break;
4736         case CST_LIMIT_SLV:
4737                 pkg_cstate_limits = slv_pkg_cstate_limits;
4738                 break;
4739         case CST_LIMIT_AMT:
4740                 pkg_cstate_limits = amt_pkg_cstate_limits;
4741                 break;
4742         case CST_LIMIT_KNL:
4743                 pkg_cstate_limits = phi_pkg_cstate_limits;
4744                 break;
4745         case CST_LIMIT_GMT:
4746                 pkg_cstate_limits = glm_pkg_cstate_limits;
4747                 break;
4748         default:
4749                 return;
4750         }
4751 
4752         get_msr(base_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr);
4753         pkg_cstate_limit = pkg_cstate_limits[msr & 0xF];
4754 }
4755 
4756 static void dump_platform_info(void)
4757 {
4758         unsigned long long msr;
4759         unsigned int ratio;
4760 
4761         if (!platform->has_nhm_msrs || no_msr)
4762                 return;
4763 
4764         get_msr(base_cpu, MSR_PLATFORM_INFO, &msr);
4765 
4766         fprintf(outf, "cpu%d: MSR_PLATFORM_INFO: 0x%08llx\n", base_cpu, msr);
4767 
4768         ratio = (msr >> 40) & 0xFF;
4769         fprintf(outf, "%d * %.1f = %.1f MHz max efficiency frequency\n", ratio, bclk, ratio * bclk);
4770 
4771         ratio = (msr >> 8) & 0xFF;
4772         fprintf(outf, "%d * %.1f = %.1f MHz base frequency\n", ratio, bclk, ratio * bclk);
4773 }
4774 
4775 static void dump_power_ctl(void)
4776 {
4777         unsigned long long msr;
4778 
4779         if (!platform->has_nhm_msrs || no_msr)
4780                 return;
4781 
4782         get_msr(base_cpu, MSR_IA32_POWER_CTL, &msr);
4783         fprintf(outf, "cpu%d: MSR_IA32_POWER_CTL: 0x%08llx (C1E auto-promotion: %sabled)\n",
4784                 base_cpu, msr, msr & 0x2 ? "EN" : "DIS");
4785 
4786         /* C-state Pre-wake Disable (CSTATE_PREWAKE_DISABLE) */
4787         if (platform->has_cst_prewake_bit)
4788                 fprintf(outf, "C-state Pre-wake: %sabled\n", msr & 0x40000000 ? "DIS" : "EN");
4789 
4790         return;
4791 }
4792 
4793 static void dump_turbo_ratio_limit2(void)
4794 {
4795         unsigned long long msr;
4796         unsigned int ratio;
4797 
4798         get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT2, &msr);
4799 
4800         fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT2: 0x%08llx\n", base_cpu, msr);
4801 
4802         ratio = (msr >> 8) & 0xFF;
4803         if (ratio)
4804                 fprintf(outf, "%d * %.1f = %.1f MHz max turbo 18 active cores\n", ratio, bclk, ratio * bclk);
4805 
4806         ratio = (msr >> 0) & 0xFF;
4807         if (ratio)
4808                 fprintf(outf, "%d * %.1f = %.1f MHz max turbo 17 active cores\n", ratio, bclk, ratio * bclk);
4809         return;
4810 }
4811 
4812 static void dump_turbo_ratio_limit1(void)
4813 {
4814         unsigned long long msr;
4815         unsigned int ratio;
4816 
4817         get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT1, &msr);
4818 
4819         fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT1: 0x%08llx\n", base_cpu, msr);
4820 
4821         ratio = (msr >> 56) & 0xFF;
4822         if (ratio)
4823                 fprintf(outf, "%d * %.1f = %.1f MHz max turbo 16 active cores\n", ratio, bclk, ratio * bclk);
4824 
4825         ratio = (msr >> 48) & 0xFF;
4826         if (ratio)
4827                 fprintf(outf, "%d * %.1f = %.1f MHz max turbo 15 active cores\n", ratio, bclk, ratio * bclk);
4828 
4829         ratio = (msr >> 40) & 0xFF;
4830         if (ratio)
4831                 fprintf(outf, "%d * %.1f = %.1f MHz max turbo 14 active cores\n", ratio, bclk, ratio * bclk);
4832 
4833         ratio = (msr >> 32) & 0xFF;
4834         if (ratio)
4835                 fprintf(outf, "%d * %.1f = %.1f MHz max turbo 13 active cores\n", ratio, bclk, ratio * bclk);
4836 
4837         ratio = (msr >> 24) & 0xFF;
4838         if (ratio)
4839                 fprintf(outf, "%d * %.1f = %.1f MHz max turbo 12 active cores\n", ratio, bclk, ratio * bclk);
4840 
4841         ratio = (msr >> 16) & 0xFF;
4842         if (ratio)
4843                 fprintf(outf, "%d * %.1f = %.1f MHz max turbo 11 active cores\n", ratio, bclk, ratio * bclk);
4844 
4845         ratio = (msr >> 8) & 0xFF;
4846         if (ratio)
4847                 fprintf(outf, "%d * %.1f = %.1f MHz max turbo 10 active cores\n", ratio, bclk, ratio * bclk);
4848 
4849         ratio = (msr >> 0) & 0xFF;
4850         if (ratio)
4851                 fprintf(outf, "%d * %.1f = %.1f MHz max turbo 9 active cores\n", ratio, bclk, ratio * bclk);
4852         return;
4853 }
4854 
4855 static void dump_turbo_ratio_limits(int trl_msr_offset)
4856 {
4857         unsigned long long msr, core_counts;
4858         int shift;
4859 
4860         get_msr(base_cpu, trl_msr_offset, &msr);
4861         fprintf(outf, "cpu%d: MSR_%sTURBO_RATIO_LIMIT: 0x%08llx\n",
4862                 base_cpu, trl_msr_offset == MSR_SECONDARY_TURBO_RATIO_LIMIT ? "SECONDARY_" : "", msr);
4863 
4864         if (platform->trl_msrs & TRL_CORECOUNT) {
4865                 get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT1, &core_counts);
4866                 fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT1: 0x%08llx\n", base_cpu, core_counts);
4867         } else {
4868                 core_counts = 0x0807060504030201;
4869         }
4870 
4871         for (shift = 56; shift >= 0; shift -= 8) {
4872                 unsigned int ratio, group_size;
4873 
4874                 ratio = (msr >> shift) & 0xFF;
4875                 group_size = (core_counts >> shift) & 0xFF;
4876                 if (ratio)
4877                         fprintf(outf, "%d * %.1f = %.1f MHz max turbo %d active cores\n",
4878                                 ratio, bclk, ratio * bclk, group_size);
4879         }
4880 
4881         return;
4882 }
4883 
4884 static void dump_atom_turbo_ratio_limits(void)
4885 {
4886         unsigned long long msr;
4887         unsigned int ratio;
4888 
4889         get_msr(base_cpu, MSR_ATOM_CORE_RATIOS, &msr);
4890         fprintf(outf, "cpu%d: MSR_ATOM_CORE_RATIOS: 0x%08llx\n", base_cpu, msr & 0xFFFFFFFF);
4891 
4892         ratio = (msr >> 0) & 0x3F;
4893         if (ratio)
4894                 fprintf(outf, "%d * %.1f = %.1f MHz minimum operating frequency\n", ratio, bclk, ratio * bclk);
4895 
4896         ratio = (msr >> 8) & 0x3F;
4897         if (ratio)
4898                 fprintf(outf, "%d * %.1f = %.1f MHz low frequency mode (LFM)\n", ratio, bclk, ratio * bclk);
4899 
4900         ratio = (msr >> 16) & 0x3F;
4901         if (ratio)
4902                 fprintf(outf, "%d * %.1f = %.1f MHz base frequency\n", ratio, bclk, ratio * bclk);
4903 
4904         get_msr(base_cpu, MSR_ATOM_CORE_TURBO_RATIOS, &msr);
4905         fprintf(outf, "cpu%d: MSR_ATOM_CORE_TURBO_RATIOS: 0x%08llx\n", base_cpu, msr & 0xFFFFFFFF);
4906 
4907         ratio = (msr >> 24) & 0x3F;
4908         if (ratio)
4909                 fprintf(outf, "%d * %.1f = %.1f MHz max turbo 4 active cores\n", ratio, bclk, ratio * bclk);
4910 
4911         ratio = (msr >> 16) & 0x3F;
4912         if (ratio)
4913                 fprintf(outf, "%d * %.1f = %.1f MHz max turbo 3 active cores\n", ratio, bclk, ratio * bclk);
4914 
4915         ratio = (msr >> 8) & 0x3F;
4916         if (ratio)
4917                 fprintf(outf, "%d * %.1f = %.1f MHz max turbo 2 active cores\n", ratio, bclk, ratio * bclk);
4918 
4919         ratio = (msr >> 0) & 0x3F;
4920         if (ratio)
4921                 fprintf(outf, "%d * %.1f = %.1f MHz max turbo 1 active core\n", ratio, bclk, ratio * bclk);
4922 }
4923 
/* Print MSR_TURBO_RATIO_LIMIT using the KNL bucket/delta encoding described below. */
static void dump_knl_turbo_ratio_limits(void)
{
	const unsigned int buckets_no = 7;

	unsigned long long msr;
	int delta_cores, delta_ratio;
	int i, b_nr;
	unsigned int cores[buckets_no];
	unsigned int ratio[buckets_no];

	get_msr(base_cpu, MSR_TURBO_RATIO_LIMIT, &msr);

	fprintf(outf, "cpu%d: MSR_TURBO_RATIO_LIMIT: 0x%08llx\n", base_cpu, msr);

	/*
	 * Turbo encoding in KNL is as follows:
	 * [0] -- Reserved
	 * [7:1] -- Base value of number of active cores of bucket 1.
	 * [15:8] -- Base value of freq ratio of bucket 1.
	 * [20:16] -- +ve delta of number of active cores of bucket 2.
	 * i.e. active cores of bucket 2 =
	 * active cores of bucket 1 + delta
	 * [23:21] -- Negative delta of freq ratio of bucket 2.
	 * i.e. freq ratio of bucket 2 =
	 * freq ratio of bucket 1 - delta
	 * [28:24]-- +ve delta of number of active cores of bucket 3.
	 * [31:29]-- -ve delta of freq ratio of bucket 3.
	 * [36:32]-- +ve delta of number of active cores of bucket 4.
	 * [39:37]-- -ve delta of freq ratio of bucket 4.
	 * [44:40]-- +ve delta of number of active cores of bucket 5.
	 * [47:45]-- -ve delta of freq ratio of bucket 5.
	 * [52:48]-- +ve delta of number of active cores of bucket 6.
	 * [55:53]-- -ve delta of freq ratio of bucket 6.
	 * [60:56]-- +ve delta of number of active cores of bucket 7.
	 * [63:61]-- -ve delta of freq ratio of bucket 7.
	 */

	/* bucket 1 is stored as absolute base values */
	b_nr = 0;
	cores[b_nr] = (msr & 0xFF) >> 1;
	ratio[b_nr] = (msr >> 8) & 0xFF;

	/* buckets 2..7: accumulate the per-bucket deltas onto the previous bucket */
	for (i = 16; i < 64; i += 8) {
		delta_cores = (msr >> i) & 0x1F;
		delta_ratio = (msr >> (i + 5)) & 0x7;

		cores[b_nr + 1] = cores[b_nr] + delta_cores;
		ratio[b_nr + 1] = ratio[b_nr] - delta_ratio;
		b_nr++;
	}

	/* print highest bucket first, skipping buckets whose ratio repeats the previous one */
	for (i = buckets_no - 1; i >= 0; i--)
		if (i > 0 ? ratio[i] != ratio[i - 1] : 1)
			fprintf(outf,
				"%d * %.1f = %.1f MHz max turbo %d active cores\n",
				ratio[i], bclk, ratio[i] * bclk, cores[i]);
}
4980 
/*
 * Decode and print MSR_PKG_CST_CONFIG_CONTROL: demotion/undemotion bits,
 * the lock bit, the package C-state limit field, and (where supported)
 * the automatic C-state conversion bit.
 */
static void dump_cst_cfg(void)
{
	unsigned long long msr;

	if (!platform->has_nhm_msrs || no_msr)
		return;

	get_msr(base_cpu, MSR_PKG_CST_CONFIG_CONTROL, &msr);

	fprintf(outf, "cpu%d: MSR_PKG_CST_CONFIG_CONTROL: 0x%08llx", base_cpu, msr);

	fprintf(outf, " (%s%s%s%s%slocked, pkg-cstate-limit=%d (%s)",
		(msr & SNB_C3_AUTO_UNDEMOTE) ? "UNdemote-C3, " : "",
		(msr & SNB_C1_AUTO_UNDEMOTE) ? "UNdemote-C1, " : "",
		(msr & NHM_C3_AUTO_DEMOTE) ? "demote-C3, " : "",
		(msr & NHM_C1_AUTO_DEMOTE) ? "demote-C1, " : "",
		(msr & (1 << 15)) ? "" : "UN", (unsigned int)msr & 0xF, pkg_cstate_limit_strings[pkg_cstate_limit]);

#define AUTOMATIC_CSTATE_CONVERSION		(1UL << 16)
	if (platform->has_cst_auto_convension) {
		fprintf(outf, ", automatic c-state conversion=%s", (msr & AUTOMATIC_CSTATE_CONVERSION) ? "on" : "off");
	}

	fprintf(outf, ")\n");

	return;
}
5008 
/*
 * Print the config-TDP MSRs: nominal base ratio, the level-1 and level-2
 * power/ratio definitions, the TDP control (selected level + lock), and
 * the turbo activation ratio.
 */
static void dump_config_tdp(void)
{
	unsigned long long msr;

	get_msr(base_cpu, MSR_CONFIG_TDP_NOMINAL, &msr);
	fprintf(outf, "cpu%d: MSR_CONFIG_TDP_NOMINAL: 0x%08llx", base_cpu, msr);
	fprintf(outf, " (base_ratio=%d)\n", (unsigned int)msr & 0xFF);

	get_msr(base_cpu, MSR_CONFIG_TDP_LEVEL_1, &msr);
	fprintf(outf, "cpu%d: MSR_CONFIG_TDP_LEVEL_1: 0x%08llx (", base_cpu, msr);
	if (msr) {
		fprintf(outf, "PKG_MIN_PWR_LVL1=%d ", (unsigned int)(msr >> 48) & 0x7FFF);
		fprintf(outf, "PKG_MAX_PWR_LVL1=%d ", (unsigned int)(msr >> 32) & 0x7FFF);
		fprintf(outf, "LVL1_RATIO=%d ", (unsigned int)(msr >> 16) & 0xFF);
		fprintf(outf, "PKG_TDP_LVL1=%d", (unsigned int)(msr) & 0x7FFF);
	}
	fprintf(outf, ")\n");

	get_msr(base_cpu, MSR_CONFIG_TDP_LEVEL_2, &msr);
	fprintf(outf, "cpu%d: MSR_CONFIG_TDP_LEVEL_2: 0x%08llx (", base_cpu, msr);
	if (msr) {
		fprintf(outf, "PKG_MIN_PWR_LVL2=%d ", (unsigned int)(msr >> 48) & 0x7FFF);
		fprintf(outf, "PKG_MAX_PWR_LVL2=%d ", (unsigned int)(msr >> 32) & 0x7FFF);
		fprintf(outf, "LVL2_RATIO=%d ", (unsigned int)(msr >> 16) & 0xFF);
		fprintf(outf, "PKG_TDP_LVL2=%d", (unsigned int)(msr) & 0x7FFF);
	}
	fprintf(outf, ")\n");

	get_msr(base_cpu, MSR_CONFIG_TDP_CONTROL, &msr);
	fprintf(outf, "cpu%d: MSR_CONFIG_TDP_CONTROL: 0x%08llx (", base_cpu, msr);
	if ((msr) & 0x3)
		fprintf(outf, "TDP_LEVEL=%d ", (unsigned int)(msr) & 0x3);
	fprintf(outf, " lock=%d", (unsigned int)(msr >> 31) & 1);
	fprintf(outf, ")\n");

	get_msr(base_cpu, MSR_TURBO_ACTIVATION_RATIO, &msr);
	fprintf(outf, "cpu%d: MSR_TURBO_ACTIVATION_RATIO: 0x%08llx (", base_cpu, msr);
	fprintf(outf, "MAX_NON_TURBO_RATIO=%d", (unsigned int)(msr) & 0xFF);
	fprintf(outf, " lock=%d", (unsigned int)(msr >> 31) & 1);
	fprintf(outf, ")\n");
}
5050 
/* Time-unit multipliers for the MSR_PKGCn_IRTL latency field (printed as ns by print_irtl()). */
unsigned int irtl_time_units[] = { 1, 32, 1024, 32768, 1048576, 33554432, 0, 0 };
5052 
5053 void print_irtl(void)
5054 {
5055         unsigned long long msr;
5056 
5057         if (!platform->has_irtl_msrs || no_msr)
5058                 return;
5059 
5060         if (platform->supported_cstates & PC3) {
5061                 get_msr(base_cpu, MSR_PKGC3_IRTL, &msr);
5062                 fprintf(outf, "cpu%d: MSR_PKGC3_IRTL: 0x%08llx (", base_cpu, msr);
5063                 fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
5064                         (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
5065         }
5066 
5067         if (platform->supported_cstates & PC6) {
5068                 get_msr(base_cpu, MSR_PKGC6_IRTL, &msr);
5069                 fprintf(outf, "cpu%d: MSR_PKGC6_IRTL: 0x%08llx (", base_cpu, msr);
5070                 fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
5071                         (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
5072         }
5073 
5074         if (platform->supported_cstates & PC7) {
5075                 get_msr(base_cpu, MSR_PKGC7_IRTL, &msr);
5076                 fprintf(outf, "cpu%d: MSR_PKGC7_IRTL: 0x%08llx (", base_cpu, msr);
5077                 fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
5078                         (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
5079         }
5080 
5081         if (platform->supported_cstates & PC8) {
5082                 get_msr(base_cpu, MSR_PKGC8_IRTL, &msr);
5083                 fprintf(outf, "cpu%d: MSR_PKGC8_IRTL: 0x%08llx (", base_cpu, msr);
5084                 fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
5085                         (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
5086         }
5087 
5088         if (platform->supported_cstates & PC9) {
5089                 get_msr(base_cpu, MSR_PKGC9_IRTL, &msr);
5090                 fprintf(outf, "cpu%d: MSR_PKGC9_IRTL: 0x%08llx (", base_cpu, msr);
5091                 fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
5092                         (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
5093         }
5094 
5095         if (platform->supported_cstates & PC10) {
5096                 get_msr(base_cpu, MSR_PKGC10_IRTL, &msr);
5097                 fprintf(outf, "cpu%d: MSR_PKGC10_IRTL: 0x%08llx (", base_cpu, msr);
5098                 fprintf(outf, "%svalid, %lld ns)\n", msr & (1 << 15) ? "" : "NOT",
5099                         (msr & 0x3FF) * irtl_time_units[(msr >> 10) & 0x3]);
5100         }
5101 }
5102 
5103 void free_fd_percpu(void)
5104 {
5105         int i;
5106 
5107         if (!fd_percpu)
5108                 return;
5109 
5110         for (i = 0; i < topo.max_cpu_num + 1; ++i) {
5111                 if (fd_percpu[i] != 0)
5112                         close(fd_percpu[i]);
5113         }
5114 
5115         free(fd_percpu);
5116         fd_percpu = NULL;
5117 }
5118 
5119 void free_fd_instr_count_percpu(void)
5120 {
5121         if (!fd_instr_count_percpu)
5122                 return;
5123 
5124         for (int i = 0; i < topo.max_cpu_num + 1; ++i) {
5125                 if (fd_instr_count_percpu[i] != 0)
5126                         close(fd_instr_count_percpu[i]);
5127         }
5128 
5129         free(fd_instr_count_percpu);
5130         fd_instr_count_percpu = NULL;
5131 }
5132 
5133 void free_fd_cstate(void)
5134 {
5135         if (!ccstate_counter_info)
5136                 return;
5137 
5138         const int counter_info_num = ccstate_counter_info_size;
5139 
5140         for (int counter_id = 0; counter_id < counter_info_num; ++counter_id) {
5141                 if (ccstate_counter_info[counter_id].fd_perf_core != -1)
5142                         close(ccstate_counter_info[counter_id].fd_perf_core);
5143 
5144                 if (ccstate_counter_info[counter_id].fd_perf_pkg != -1)
5145                         close(ccstate_counter_info[counter_id].fd_perf_pkg);
5146         }
5147 
5148         free(ccstate_counter_info);
5149         ccstate_counter_info = NULL;
5150         ccstate_counter_info_size = 0;
5151 }
5152 
5153 void free_fd_msr(void)
5154 {
5155         if (!msr_counter_info)
5156                 return;
5157 
5158         for (int cpu = 0; cpu < topo.max_cpu_num; ++cpu) {
5159                 if (msr_counter_info[cpu].fd_perf != -1)
5160                         close(msr_counter_info[cpu].fd_perf);
5161         }
5162 
5163         free(msr_counter_info);
5164         msr_counter_info = NULL;
5165         msr_counter_info_size = 0;
5166 }
5167 
5168 void free_fd_rapl_percpu(void)
5169 {
5170         if (!rapl_counter_info_perdomain)
5171                 return;
5172 
5173         const int num_domains = rapl_counter_info_perdomain_size;
5174 
5175         for (int domain_id = 0; domain_id < num_domains; ++domain_id) {
5176                 if (rapl_counter_info_perdomain[domain_id].fd_perf != -1)
5177                         close(rapl_counter_info_perdomain[domain_id].fd_perf);
5178         }
5179 
5180         free(rapl_counter_info_perdomain);
5181         rapl_counter_info_perdomain = NULL;
5182         rapl_counter_info_perdomain_size = 0;
5183 }
5184 
5185 void free_fd_added_perf_counters_(struct perf_counter_info *pp)
5186 {
5187         if (!pp)
5188                 return;
5189 
5190         if (!pp->fd_perf_per_domain)
5191                 return;
5192 
5193         while (pp) {
5194                 for (size_t domain = 0; domain < pp->num_domains; ++domain) {
5195                         if (pp->fd_perf_per_domain[domain] != -1) {
5196                                 close(pp->fd_perf_per_domain[domain]);
5197                                 pp->fd_perf_per_domain[domain] = -1;
5198                         }
5199                 }
5200 
5201                 free(pp->fd_perf_per_domain);
5202                 pp->fd_perf_per_domain = NULL;
5203 
5204                 pp = pp->next;
5205         }
5206 }
5207 
5208 void free_fd_added_perf_counters(void)
5209 {
5210         free_fd_added_perf_counters_(sys.perf_tp);
5211         free_fd_added_perf_counters_(sys.perf_cp);
5212         free_fd_added_perf_counters_(sys.perf_pp);
5213 }
5214 
/*
 * Release all dynamically allocated global state: CPU sets, topology
 * iteration arrays, the output buffer, every cached file descriptor,
 * IRQ bookkeeping, and the per-CPU topology table.
 */
void free_all_buffers(void)
{
	int i;

	CPU_FREE(cpu_present_set);
	cpu_present_set = NULL;
	cpu_present_setsize = 0;

	CPU_FREE(cpu_effective_set);
	cpu_effective_set = NULL;
	cpu_effective_setsize = 0;

	CPU_FREE(cpu_allowed_set);
	cpu_allowed_set = NULL;
	cpu_allowed_setsize = 0;

	CPU_FREE(cpu_affinity_set);
	cpu_affinity_set = NULL;
	cpu_affinity_setsize = 0;

	free(thread_even);
	free(core_even);
	free(package_even);

	thread_even = NULL;
	core_even = NULL;
	package_even = NULL;

	free(thread_odd);
	free(core_odd);
	free(package_odd);

	thread_odd = NULL;
	core_odd = NULL;
	package_odd = NULL;

	free(output_buffer);
	output_buffer = NULL;
	outp = NULL;

	/* close all cached file descriptors */
	free_fd_percpu();
	free_fd_instr_count_percpu();
	free_fd_msr();
	free_fd_rapl_percpu();
	free_fd_cstate();
	free_fd_added_perf_counters();

	free(irq_column_2_cpu);
	free(irqs_per_cpu);

	/* each cpus[] entry may own a dynamically sized cpu_set_t */
	for (i = 0; i <= topo.max_cpu_num; ++i) {
		if (cpus[i].put_ids)
			CPU_FREE(cpus[i].put_ids);
	}
	free(cpus);
}
5271 
5272 /*
5273  * Parse a file containing a single int.
5274  * Return 0 if file can not be opened
5275  * Exit if file can be opened, but can not be parsed
5276  */
5277 int parse_int_file(const char *fmt, ...)
5278 {
5279         va_list args;
5280         char path[PATH_MAX];
5281         FILE *filep;
5282         int value;
5283 
5284         va_start(args, fmt);
5285         vsnprintf(path, sizeof(path), fmt, args);
5286         va_end(args);
5287         filep = fopen(path, "r");
5288         if (!filep)
5289                 return 0;
5290         if (fscanf(filep, "%d", &value) != 1)
5291                 err(1, "%s: failed to parse number from file", path);
5292         fclose(filep);
5293         return value;
5294 }
5295 
5296 /*
5297  * cpu_is_first_core_in_package(cpu)
5298  * return 1 if given CPU is 1st core in package
5299  */
5300 int cpu_is_first_core_in_package(int cpu)
5301 {
5302         return cpu == parse_int_file("/sys/devices/system/cpu/cpu%d/topology/core_siblings_list", cpu);
5303 }
5304 
/* Return the sysfs physical_package_id for the given CPU. */
int get_physical_package_id(int cpu)
{
	int pkg_id = parse_int_file("/sys/devices/system/cpu/cpu%d/topology/physical_package_id", cpu);

	return pkg_id;
}
5309 
/* Return the sysfs die_id for the given CPU. */
int get_die_id(int cpu)
{
	int die_id = parse_int_file("/sys/devices/system/cpu/cpu%d/topology/die_id", cpu);

	return die_id;
}
5314 
/* Return the sysfs core_id for @cpu (0 if the file is unreadable). */
int get_core_id(int cpu)
{
	return parse_int_file("/sys/devices/system/cpu/cpu%d/topology/core_id", cpu);
}
5319 
/*
 * set_node_data()
 *
 * Assign a package-relative logical_node_id to every cpu, and track the
 * maximum nodes seen in any package in topo.nodes_per_pkg.
 */
void set_node_data(void)
{
	int pkg, node, lnode, cpu, cpux;
	int cpu_count;

	/* initialize logical_node_id */
	for (cpu = 0; cpu <= topo.max_cpu_num; ++cpu)
		cpus[cpu].logical_node_id = -1;

	cpu_count = 0;
	for (pkg = 0; pkg < topo.num_packages; pkg++) {
		lnode = 0;
		for (cpu = 0; cpu <= topo.max_cpu_num; ++cpu) {
			if (cpus[cpu].physical_package_id != pkg)
				continue;
			/* find a cpu with an unset logical_node_id */
			if (cpus[cpu].logical_node_id != -1)
				continue;
			cpus[cpu].logical_node_id = lnode;
			node = cpus[cpu].physical_node_id;
			cpu_count++;
			/*
			 * find all matching cpus on this pkg and set
			 * the logical_node_id
			 *
			 * NOTE(review): the scan below starts at cpux == cpu, so
			 * the seed cpu is counted a second time in cpu_count —
			 * confirm the early-exit test below tolerates this.
			 */
			for (cpux = cpu; cpux <= topo.max_cpu_num; cpux++) {
				if ((cpus[cpux].physical_package_id == pkg) && (cpus[cpux].physical_node_id == node)) {
					cpus[cpux].logical_node_id = lnode;
					cpu_count++;
				}
			}
			lnode++;
			if (lnode > topo.nodes_per_pkg)
				topo.nodes_per_pkg = lnode;
		}
		/* stop once every cpu has been visited */
		if (cpu_count >= topo.max_cpu_num)
			break;
	}
}
5359 
5360 int get_physical_node_id(struct cpu_topology *thiscpu)
5361 {
5362         char path[80];
5363         FILE *filep;
5364         int i;
5365         int cpu = thiscpu->logical_cpu_id;
5366 
5367         for (i = 0; i <= topo.max_cpu_num; i++) {
5368                 sprintf(path, "/sys/devices/system/cpu/cpu%d/node%i/cpulist", cpu, i);
5369                 filep = fopen(path, "r");
5370                 if (!filep)
5371                         continue;
5372                 fclose(filep);
5373                 return i;
5374         }
5375         return -1;
5376 }
5377 
/*
 * parse_cpu_str()
 *
 * Parse a cpu-list string such as "0-3,7" (ranges may also be written
 * "0..3") into @cpu_set.  Return 0 on success, 1 on malformed input or
 * on any cpu number >= CPU_SUBSET_MAXCPUS.
 *
 * NOTE(review): a degenerate range such as "5-5" is rejected by the
 * "end <= start" test below — confirm that is intended.
 */
static int parse_cpu_str(char *cpu_str, cpu_set_t *cpu_set, int cpu_set_size)
{
	unsigned int start, end;
	char *next = cpu_str;

	while (next && *next) {

		if (*next == '-')	/* no negative cpu numbers */
			return 1;

		start = strtoul(next, &next, 10);

		if (start >= CPU_SUBSET_MAXCPUS)
			return 1;
		CPU_SET_S(start, cpu_set_size, cpu_set);

		if (*next == '\0' || *next == '\n')
			break;

		if (*next == ',') {
			next += 1;
			continue;
		}

		if (*next == '-') {
			next += 1;	/* start range */
		} else if (*next == '.') {
			next += 1;
			if (*next == '.')
				next += 1;	/* start range */
			else
				return 1;
		}

		/* parse the inclusive end of the range */
		end = strtoul(next, &next, 10);
		if (end <= start)
			return 1;

		/* mark every remaining cpu in the range */
		while (++start <= end) {
			if (start >= CPU_SUBSET_MAXCPUS)
				return 1;
			CPU_SET_S(start, cpu_set_size, cpu_set);
		}

		if (*next == ',')
			next += 1;
		else if (*next != '\0' && *next != '\n')
			return 1;
	}

	return 0;
}
5430 
/*
 * get_thread_siblings()
 *
 * Read this cpu's thread_siblings bitmask from sysfs.  Record the
 * siblings that share this cpu's physical core in thiscpu->put_ids,
 * handing out sequential thread_ids as siblings are discovered.
 * Return the sibling count, or -1 on allocation/open failure.
 */
int get_thread_siblings(struct cpu_topology *thiscpu)
{
	char path[80], character;
	FILE *filep;
	unsigned long map;
	int so, shift, sib_core;
	int cpu = thiscpu->logical_cpu_id;
	int offset = topo.max_cpu_num + 1;
	size_t size;
	int thread_id = 0;

	thiscpu->put_ids = CPU_ALLOC((topo.max_cpu_num + 1));
	if (thiscpu->thread_id < 0)
		thiscpu->thread_id = thread_id++;
	if (!thiscpu->put_ids)
		return -1;

	size = CPU_ALLOC_SIZE((topo.max_cpu_num + 1));
	CPU_ZERO_S(size, thiscpu->put_ids);

	sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", cpu);
	filep = fopen(path, "r");

	if (!filep) {
		warnx("%s: open failed", path);
		return -1;
	}
	/*
	 * The sysfs mask is comma-separated hex words of BITMASK_SIZE bits,
	 * highest cpus first; offset walks down one word per iteration.
	 */
	do {
		offset -= BITMASK_SIZE;
		if (fscanf(filep, "%lx%c", &map, &character) != 2)
			err(1, "%s: failed to parse file", path);
		for (shift = 0; shift < BITMASK_SIZE; shift++) {
			if ((map >> shift) & 0x1) {
				so = shift + offset;
				sib_core = get_core_id(so);
				if (sib_core == thiscpu->physical_core_id) {
					CPU_SET_S(so, size, thiscpu->put_ids);
					/* give each newly-seen sibling the next thread_id */
					if ((so != cpu) && (cpus[so].thread_id < 0))
						cpus[so].thread_id = thread_id++;
				}
			}
		}
	} while (character == ',');
	fclose(filep);

	return CPU_COUNT_S(size, thiscpu->put_ids);
}
5478 
5479 /*
5480  * run func(thread, core, package) in topology order
5481  * skip non-present cpus
5482  */
5483 
5484 int for_all_cpus_2(int (func) (struct thread_data *, struct core_data *,
5485                                struct pkg_data *, struct thread_data *, struct core_data *,
5486                                struct pkg_data *), struct thread_data *thread_base,
5487                    struct core_data *core_base, struct pkg_data *pkg_base,
5488                    struct thread_data *thread_base2, struct core_data *core_base2, struct pkg_data *pkg_base2)
5489 {
5490         int retval, pkg_no, node_no, core_no, thread_no;
5491 
5492         for (pkg_no = 0; pkg_no < topo.num_packages; ++pkg_no) {
5493                 for (node_no = 0; node_no < topo.nodes_per_pkg; ++node_no) {
5494                         for (core_no = 0; core_no < topo.cores_per_node; ++core_no) {
5495                                 for (thread_no = 0; thread_no < topo.threads_per_core; ++thread_no) {
5496                                         struct thread_data *t, *t2;
5497                                         struct core_data *c, *c2;
5498                                         struct pkg_data *p, *p2;
5499 
5500                                         t = GET_THREAD(thread_base, thread_no, core_no, node_no, pkg_no);
5501 
5502                                         if (cpu_is_not_allowed(t->cpu_id))
5503                                                 continue;
5504 
5505                                         t2 = GET_THREAD(thread_base2, thread_no, core_no, node_no, pkg_no);
5506 
5507                                         c = GET_CORE(core_base, core_no, node_no, pkg_no);
5508                                         c2 = GET_CORE(core_base2, core_no, node_no, pkg_no);
5509 
5510                                         p = GET_PKG(pkg_base, pkg_no);
5511                                         p2 = GET_PKG(pkg_base2, pkg_no);
5512 
5513                                         retval = func(t, c, p, t2, c2, p2);
5514                                         if (retval)
5515                                                 return retval;
5516                                 }
5517                         }
5518                 }
5519         }
5520         return 0;
5521 }
5522 
5523 /*
5524  * run func(cpu) on every cpu in /proc/stat
5525  * return max_cpu number
5526  */
5527 int for_all_proc_cpus(int (func) (int))
5528 {
5529         FILE *fp;
5530         int cpu_num;
5531         int retval;
5532 
5533         fp = fopen_or_die(proc_stat, "r");
5534 
5535         retval = fscanf(fp, "cpu %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d\n");
5536         if (retval != 0)
5537                 err(1, "%s: failed to parse format", proc_stat);
5538 
5539         while (1) {
5540                 retval = fscanf(fp, "cpu%u %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d\n", &cpu_num);
5541                 if (retval != 1)
5542                         break;
5543 
5544                 retval = func(cpu_num);
5545                 if (retval) {
5546                         fclose(fp);
5547                         return (retval);
5548                 }
5549         }
5550         fclose(fp);
5551         return 0;
5552 }
5553 
5554 #define PATH_EFFECTIVE_CPUS     "/sys/fs/cgroup/cpuset.cpus.effective"
5555 
5556 static char cpu_effective_str[1024];
5557 
5558 static int update_effective_str(bool startup)
5559 {
5560         FILE *fp;
5561         char *pos;
5562         char buf[1024];
5563         int ret;
5564 
5565         if (cpu_effective_str[0] == '\0' && !startup)
5566                 return 0;
5567 
5568         fp = fopen(PATH_EFFECTIVE_CPUS, "r");
5569         if (!fp)
5570                 return 0;
5571 
5572         pos = fgets(buf, 1024, fp);
5573         if (!pos)
5574                 err(1, "%s: file read failed\n", PATH_EFFECTIVE_CPUS);
5575 
5576         fclose(fp);
5577 
5578         ret = strncmp(cpu_effective_str, buf, 1024);
5579         if (!ret)
5580                 return 0;
5581 
5582         strncpy(cpu_effective_str, buf, 1024);
5583         return 1;
5584 }
5585 
5586 static void update_effective_set(bool startup)
5587 {
5588         update_effective_str(startup);
5589 
5590         if (parse_cpu_str(cpu_effective_str, cpu_effective_set, cpu_effective_setsize))
5591                 err(1, "%s: cpu str malformat %s\n", PATH_EFFECTIVE_CPUS, cpu_effective_str);
5592 }
5593 
5594 void linux_perf_init(void);
5595 void msr_perf_init(void);
5596 void rapl_perf_init(void);
5597 void cstate_perf_init(void);
5598 void added_perf_counters_init(void);
5599 void pmt_init(void);
5600 
/*
 * Tear down and rebuild all per-cpu state (buffers, perf/MSR/RAPL/
 * cstate/PMT access) after a topology or cpuset change.
 */
void re_initialize(void)
{
	free_all_buffers();
	setup_all_buffers(false);
	linux_perf_init();
	msr_perf_init();
	rapl_perf_init();
	cstate_perf_init();
	added_perf_counters_init();
	pmt_init();
	fprintf(outf, "turbostat: re-initialized with num_cpus %d, allowed_cpus %d\n", topo.num_cpus,
		topo.allowed_cpus);
}
5614 
5615 void set_max_cpu_num(void)
5616 {
5617         FILE *filep;
5618         int base_cpu;
5619         unsigned long dummy;
5620         char pathname[64];
5621 
5622         base_cpu = sched_getcpu();
5623         if (base_cpu < 0)
5624                 err(1, "cannot find calling cpu ID");
5625         sprintf(pathname, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", base_cpu);
5626 
5627         filep = fopen_or_die(pathname, "r");
5628         topo.max_cpu_num = 0;
5629         while (fscanf(filep, "%lx,", &dummy) == 1)
5630                 topo.max_cpu_num += BITMASK_SIZE;
5631         fclose(filep);
5632         topo.max_cpu_num--;     /* 0 based */
5633 }
5634 
5635 /*
5636  * count_cpus()
5637  * remember the last one seen, it will be the max
5638  */
5639 int count_cpus(int cpu)
5640 {
5641         UNUSED(cpu);
5642 
5643         topo.num_cpus++;
5644         return 0;
5645 }
5646 
/* Record @cpu in cpu_present_set. */
int mark_cpu_present(int cpu)
{
	CPU_SET_S(cpu, cpu_present_setsize, cpu_present_set);
	return 0;
}
5652 
/* Mark @cpu's thread_id as unassigned (-1). */
int init_thread_id(int cpu)
{
	cpus[cpu].thread_id = -1;
	return 0;
}
5658 
5659 /*
5660  * snapshot_proc_interrupts()
5661  *
5662  * read and record summary of /proc/interrupts
5663  *
5664  * return 1 if config change requires a restart, else return 0
5665  */
5666 int snapshot_proc_interrupts(void)
5667 {
5668         static FILE *fp;
5669         int column, retval;
5670 
5671         if (fp == NULL)
5672                 fp = fopen_or_die("/proc/interrupts", "r");
5673         else
5674                 rewind(fp);
5675 
5676         /* read 1st line of /proc/interrupts to get cpu* name for each column */
5677         for (column = 0; column < topo.num_cpus; ++column) {
5678                 int cpu_number;
5679 
5680                 retval = fscanf(fp, " CPU%d", &cpu_number);
5681                 if (retval != 1)
5682                         break;
5683 
5684                 if (cpu_number > topo.max_cpu_num) {
5685                         warn("/proc/interrupts: cpu%d: > %d", cpu_number, topo.max_cpu_num);
5686                         return 1;
5687                 }
5688 
5689                 irq_column_2_cpu[column] = cpu_number;
5690                 irqs_per_cpu[cpu_number] = 0;
5691         }
5692 
5693         /* read /proc/interrupt count lines and sum up irqs per cpu */
5694         while (1) {
5695                 int column;
5696                 char buf[64];
5697 
5698                 retval = fscanf(fp, " %s:", buf);       /* flush irq# "N:" */
5699                 if (retval != 1)
5700                         break;
5701 
5702                 /* read the count per cpu */
5703                 for (column = 0; column < topo.num_cpus; ++column) {
5704 
5705                         int cpu_number, irq_count;
5706 
5707                         retval = fscanf(fp, " %d", &irq_count);
5708                         if (retval != 1)
5709                                 break;
5710 
5711                         cpu_number = irq_column_2_cpu[column];
5712                         irqs_per_cpu[cpu_number] += irq_count;
5713 
5714                 }
5715 
5716                 while (getc(fp) != '\n') ;      /* flush interrupt description */
5717 
5718         }
5719         return 0;
5720 }
5721 
5722 /*
5723  * snapshot_graphics()
5724  *
5725  * record snapshot of specified graphics sysfs knob
5726  *
5727  * return 1 if config change requires a restart, else return 0
5728  */
5729 int snapshot_graphics(int idx)
5730 {
5731         FILE *fp;
5732         int retval;
5733 
5734         switch (idx) {
5735         case GFX_rc6:
5736         case SAM_mc6:
5737                 fp = fopen_or_die(gfx_info[idx].path, "r");
5738                 retval = fscanf(fp, "%lld", &gfx_info[idx].val_ull);
5739                 if (retval != 1)
5740                         err(1, "rc6");
5741                 fclose(fp);
5742                 return 0;
5743         case GFX_MHz:
5744         case GFX_ACTMHz:
5745         case SAM_MHz:
5746         case SAM_ACTMHz:
5747                 if (gfx_info[idx].fp == NULL) {
5748                         gfx_info[idx].fp = fopen_or_die(gfx_info[idx].path, "r");
5749                 } else {
5750                         rewind(gfx_info[idx].fp);
5751                         fflush(gfx_info[idx].fp);
5752                 }
5753                 retval = fscanf(gfx_info[idx].fp, "%d", &gfx_info[idx].val);
5754                 if (retval != 1)
5755                         err(1, "MHz");
5756                 return 0;
5757         default:
5758                 return -EINVAL;
5759         }
5760 }
5761 
5762 /*
5763  * snapshot_cpu_lpi()
5764  *
5765  * record snapshot of
5766  * /sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us
5767  */
5768 int snapshot_cpu_lpi_us(void)
5769 {
5770         FILE *fp;
5771         int retval;
5772 
5773         fp = fopen_or_die("/sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us", "r");
5774 
5775         retval = fscanf(fp, "%lld", &cpuidle_cur_cpu_lpi_us);
5776         if (retval != 1) {
5777                 fprintf(stderr, "Disabling Low Power Idle CPU output\n");
5778                 BIC_NOT_PRESENT(BIC_CPU_LPI);
5779                 fclose(fp);
5780                 return -1;
5781         }
5782 
5783         fclose(fp);
5784 
5785         return 0;
5786 }
5787 
5788 /*
5789  * snapshot_sys_lpi()
5790  *
5791  * record snapshot of sys_lpi_file
5792  */
5793 int snapshot_sys_lpi_us(void)
5794 {
5795         FILE *fp;
5796         int retval;
5797 
5798         fp = fopen_or_die(sys_lpi_file, "r");
5799 
5800         retval = fscanf(fp, "%lld", &cpuidle_cur_sys_lpi_us);
5801         if (retval != 1) {
5802                 fprintf(stderr, "Disabling Low Power Idle System output\n");
5803                 BIC_NOT_PRESENT(BIC_SYS_LPI);
5804                 fclose(fp);
5805                 return -1;
5806         }
5807         fclose(fp);
5808 
5809         return 0;
5810 }
5811 
5812 /*
5813  * snapshot /proc and /sys files
5814  *
5815  * return 1 if configuration restart needed, else return 0
5816  */
5817 int snapshot_proc_sysfs_files(void)
5818 {
5819         if (DO_BIC(BIC_IRQ))
5820                 if (snapshot_proc_interrupts())
5821                         return 1;
5822 
5823         if (DO_BIC(BIC_GFX_rc6))
5824                 snapshot_graphics(GFX_rc6);
5825 
5826         if (DO_BIC(BIC_GFXMHz))
5827                 snapshot_graphics(GFX_MHz);
5828 
5829         if (DO_BIC(BIC_GFXACTMHz))
5830                 snapshot_graphics(GFX_ACTMHz);
5831 
5832         if (DO_BIC(BIC_SAM_mc6))
5833                 snapshot_graphics(SAM_mc6);
5834 
5835         if (DO_BIC(BIC_SAMMHz))
5836                 snapshot_graphics(SAM_MHz);
5837 
5838         if (DO_BIC(BIC_SAMACTMHz))
5839                 snapshot_graphics(SAM_ACTMHz);
5840 
5841         if (DO_BIC(BIC_CPU_LPI))
5842                 snapshot_cpu_lpi_us();
5843 
5844         if (DO_BIC(BIC_SYS_LPI))
5845                 snapshot_sys_lpi_us();
5846 
5847         return 0;
5848 }
5849 
5850 int exit_requested;
5851 
/*
 * SIGINT requests a clean exit at the next loop iteration; SIGUSR1
 * only wakes the process (e.g. out of do_sleep()'s select()).
 *
 * NOTE(review): fprintf() is not async-signal-safe, and exit_requested
 * is a plain int rather than volatile sig_atomic_t — probably fine for
 * this tool, but worth confirming.
 */
static void signal_handler(int signal)
{
	switch (signal) {
	case SIGINT:
		exit_requested = 1;
		if (debug)
			fprintf(stderr, " SIGINT\n");
		break;
	case SIGUSR1:
		if (debug > 1)
			fprintf(stderr, "SIGUSR1\n");
		break;
	}
}
5866 
5867 void setup_signal_handler(void)
5868 {
5869         struct sigaction sa;
5870 
5871         memset(&sa, 0, sizeof(sa));
5872 
5873         sa.sa_handler = &signal_handler;
5874 
5875         if (sigaction(SIGINT, &sa, NULL) < 0)
5876                 err(1, "sigaction SIGINT");
5877         if (sigaction(SIGUSR1, &sa, NULL) < 0)
5878                 err(1, "sigaction SIGUSR1");
5879 }
5880 
/*
 * do_sleep()
 *
 * Sleep for one measurement interval.  Unless stdin is ignored, also
 * watch fd 0 so a 'q' keypress requests exit; EOF on stdin (closed
 * pipe) switches to plain nanosleep for all later intervals.
 */
void do_sleep(void)
{
	struct timeval tout;
	struct timespec rest;
	fd_set readfds;
	int retval;

	FD_ZERO(&readfds);
	FD_SET(0, &readfds);

	/* pipe/-n mode: plain sleep, no stdin polling */
	if (ignore_stdin) {
		nanosleep(&interval_ts, NULL);
		return;
	}

	tout = interval_tv;
	retval = select(1, &readfds, NULL, NULL, &tout);

	/* retval == 1 means stdin became readable before the timeout */
	if (retval == 1) {
		switch (getc(stdin)) {
		case 'q':
			exit_requested = 1;
			break;
		case EOF:
			/*
			 * 'stdin' is a pipe closed on the other end. There
			 * won't be any further input.
			 */
			ignore_stdin = 1;
			/* Sleep the rest of the time */
			rest.tv_sec = (tout.tv_sec + tout.tv_usec / 1000000);
			rest.tv_nsec = (tout.tv_usec % 1000000) * 1000;
			nanosleep(&rest, NULL);
		}
	}
}
5917 
/*
 * get_msr_sum()
 *
 * Return in @msr the wrap-corrected running total for the MSR at
 * @offset: the periodically accumulated sum plus the 32-bit delta
 * since the last timer snapshot.
 * Return 0 on success, non-zero on failure.
 */
int get_msr_sum(int cpu, off_t offset, unsigned long long *msr)
{
	int ret, idx;
	unsigned long long msr_cur, msr_last;

	assert(!no_msr);

	/* timer setup failed or never ran: no accumulated sums available */
	if (!per_cpu_msr_sum)
		return 1;

	idx = offset_to_idx(offset);
	if (idx < 0)
		return idx;
	/* get_msr_sum() = sum + (get_msr() - last) */
	ret = get_msr(cpu, offset, &msr_cur);
	if (ret)
		return ret;
	msr_last = per_cpu_msr_sum[cpu].entries[idx].last;
	DELTA_WRAP32(msr_cur, msr_last);
	*msr = msr_last + per_cpu_msr_sum[cpu].entries[idx].sum;

	return 0;
}
5941 
5942 timer_t timerid;
5943 
/* Timer callback, update the sum of MSRs periodically. */
static int update_msr_sum(struct thread_data *t, struct core_data *c, struct pkg_data *p)
{
	int i, ret;
	int cpu = t->cpu_id;

	UNUSED(c);
	UNUSED(p);

	assert(!no_msr);

	for (i = IDX_PKG_ENERGY; i < IDX_COUNT; i++) {
		unsigned long long msr_cur, msr_last;
		off_t offset;

		if (!idx_valid(i))
			continue;
		offset = idx_to_offset(i);
		if (offset < 0)
			continue;
		/* a failed read is logged but does not abort the sweep */
		ret = get_msr(cpu, offset, &msr_cur);
		if (ret) {
			fprintf(outf, "Can not update msr(0x%llx)\n", (unsigned long long)offset);
			continue;
		}

		/* remember the low 32 bits for the next wrap calculation */
		msr_last = per_cpu_msr_sum[cpu].entries[i].last;
		per_cpu_msr_sum[cpu].entries[i].last = msr_cur & 0xffffffff;

		DELTA_WRAP32(msr_cur, msr_last);
		per_cpu_msr_sum[cpu].entries[i].sum += msr_last;
	}
	return 0;
}
5978 
/* SIGEV_THREAD timer notification: accumulate all tracked MSR sums. */
static void msr_record_handler(union sigval v)
{
	UNUSED(v);

	for_all_cpus(update_msr_sum, EVEN_COUNTERS);
}
5985 
5986 void msr_sum_record(void)
5987 {
5988         struct itimerspec its;
5989         struct sigevent sev;
5990 
5991         per_cpu_msr_sum = calloc(topo.max_cpu_num + 1, sizeof(struct msr_sum_array));
5992         if (!per_cpu_msr_sum) {
5993                 fprintf(outf, "Can not allocate memory for long time MSR.\n");
5994                 return;
5995         }
5996         /*
5997          * Signal handler might be restricted, so use thread notifier instead.
5998          */
5999         memset(&sev, 0, sizeof(struct sigevent));
6000         sev.sigev_notify = SIGEV_THREAD;
6001         sev.sigev_notify_function = msr_record_handler;
6002 
6003         sev.sigev_value.sival_ptr = &timerid;
6004         if (timer_create(CLOCK_REALTIME, &sev, &timerid) == -1) {
6005                 fprintf(outf, "Can not create timer.\n");
6006                 goto release_msr;
6007         }
6008 
6009         its.it_value.tv_sec = 0;
6010         its.it_value.tv_nsec = 1;
6011         /*
6012          * A wraparound time has been calculated early.
6013          * Some sources state that the peak power for a
6014          * microprocessor is usually 1.5 times the TDP rating,
6015          * use 2 * TDP for safety.
6016          */
6017         its.it_interval.tv_sec = rapl_joule_counter_range / 2;
6018         its.it_interval.tv_nsec = 0;
6019 
6020         if (timer_settime(timerid, 0, &its, NULL) == -1) {
6021                 fprintf(outf, "Can not set timer.\n");
6022                 goto release_timer;
6023         }
6024         return;
6025 
6026 release_timer:
6027         timer_delete(timerid);
6028 release_msr:
6029         free(per_cpu_msr_sum);
6030 }
6031 
6032 /*
6033  * set_my_sched_priority(pri)
6034  * return previous priority on success
6035  * return value < -20 on failure
6036  */
6037 int set_my_sched_priority(int priority)
6038 {
6039         int retval;
6040         int original_priority;
6041 
6042         errno = 0;
6043         original_priority = getpriority(PRIO_PROCESS, 0);
6044         if (errno && (original_priority == -1))
6045                 return -21;
6046 
6047         retval = setpriority(PRIO_PROCESS, 0, priority);
6048         if (retval)
6049                 return -21;
6050 
6051         errno = 0;
6052         retval = getpriority(PRIO_PROCESS, 0);
6053         if (retval != priority)
6054                 return -21;
6055 
6056         return original_priority;
6057 }
6058 
/*
 * turbostat_loop()
 *
 * Interval-mode main loop: alternate even/odd counter snapshots, print
 * the delta after each interval, and restart via re_initialize() when
 * the topology or cpuset changes underneath us.
 */
void turbostat_loop()
{
	int retval;
	int restarted = 0;
	unsigned int done_iters = 0;

	setup_signal_handler();

	/*
	 * elevate own priority for interval mode
	 *
	 * ignore on error - we probably don't have permission to set it, but
	 * it's not a big deal
	 */
	set_my_sched_priority(-20);

restart:
	restarted++;

	snapshot_proc_sysfs_files();
	retval = for_all_cpus(get_counters, EVEN_COUNTERS);
	first_counter_read = 0;
	if (retval < -1) {
		exit(retval);
	} else if (retval == -1) {
		/* -1 means "retry after re-init"; give up after 10 consecutive failures */
		if (restarted > 10) {
			exit(retval);
		}
		re_initialize();
		goto restart;
	}
	restarted = 0;
	done_iters = 0;
	gettimeofday(&tv_even, (struct timezone *)NULL);

	while (1) {
		/* a cpu going on/offline invalidates the topology */
		if (for_all_proc_cpus(cpu_is_not_present)) {
			re_initialize();
			goto restart;
		}
		/* a cpuset/cgroup change likewise forces a restart */
		if (update_effective_str(false)) {
			re_initialize();
			goto restart;
		}
		do_sleep();
		if (snapshot_proc_sysfs_files())
			goto restart;
		retval = for_all_cpus(get_counters, ODD_COUNTERS);
		if (retval < -1) {
			exit(retval);
		} else if (retval == -1) {
			re_initialize();
			goto restart;
		}
		gettimeofday(&tv_odd, (struct timezone *)NULL);
		timersub(&tv_odd, &tv_even, &tv_delta);
		if (for_all_cpus_2(delta_cpu, ODD_COUNTERS, EVEN_COUNTERS)) {
			re_initialize();
			goto restart;
		}
		/* deltas were stored into the EVEN set; report them */
		compute_average(EVEN_COUNTERS);
		format_all_counters(EVEN_COUNTERS);
		flush_output_stdout();
		if (exit_requested)
			break;
		if (num_iterations && ++done_iters >= num_iterations)
			break;
		/* second half-cycle: roles of even/odd swap */
		do_sleep();
		if (snapshot_proc_sysfs_files())
			goto restart;
		retval = for_all_cpus(get_counters, EVEN_COUNTERS);
		if (retval < -1) {
			exit(retval);
		} else if (retval == -1) {
			re_initialize();
			goto restart;
		}
		gettimeofday(&tv_even, (struct timezone *)NULL);
		timersub(&tv_even, &tv_odd, &tv_delta);
		if (for_all_cpus_2(delta_cpu, EVEN_COUNTERS, ODD_COUNTERS)) {
			re_initialize();
			goto restart;
		}
		compute_average(ODD_COUNTERS);
		format_all_counters(ODD_COUNTERS);
		flush_output_stdout();
		if (exit_requested)
			break;
		if (num_iterations && ++done_iters >= num_iterations)
			break;
	}
}
6151 
6152 void check_dev_msr()
6153 {
6154         struct stat sb;
6155         char pathname[32];
6156 
6157         if (no_msr)
6158                 return;
6159 
6160         sprintf(pathname, "/dev/cpu/%d/msr", base_cpu);
6161         if (stat(pathname, &sb))
6162                 if (system("/sbin/modprobe msr > /dev/null 2>&1"))
6163                         no_msr = 1;
6164 }
6165 
6166 /*
6167  * check for CAP_SYS_RAWIO
6168  * return 0 on success
6169  * return 1 on fail
6170  */
6171 int check_for_cap_sys_rawio(void)
6172 {
6173         cap_t caps;
6174         cap_flag_value_t cap_flag_value;
6175         int ret = 0;
6176 
6177         caps = cap_get_proc();
6178         if (caps == NULL)
6179                 return 1;
6180 
6181         if (cap_get_flag(caps, CAP_SYS_RAWIO, CAP_EFFECTIVE, &cap_flag_value)) {
6182                 ret = 1;
6183                 goto free_and_exit;
6184         }
6185 
6186         if (cap_flag_value != CAP_SET) {
6187                 ret = 1;
6188                 goto free_and_exit;
6189         }
6190 
6191 free_and_exit:
6192         if (cap_free(caps) == -1)
6193                 err(-6, "cap_free\n");
6194 
6195         return ret;
6196 }
6197 
6198 void check_msr_permission(void)
6199 {
6200         int failed = 0;
6201         char pathname[32];
6202 
6203         if (no_msr)
6204                 return;
6205 
6206         /* check for CAP_SYS_RAWIO */
6207         failed += check_for_cap_sys_rawio();
6208 
6209         /* test file permissions */
6210         sprintf(pathname, "/dev/cpu/%d/msr", base_cpu);
6211         if (euidaccess(pathname, R_OK)) {
6212                 failed++;
6213         }
6214 
6215         if (failed) {
6216                 warnx("Failed to access %s. Some of the counters may not be available\n"
6217                       "\tRun as root to enable them or use %s to disable the access explicitly", pathname, "--no-msr");
6218                 no_msr = 1;
6219         }
6220 }
6221 
/*
 * probe_bclk()
 *
 * Set bclk (MHz) from the platform table, then derive base_hz from the
 * base ratio in MSR_PLATFORM_INFO bits [15:8].
 */
void probe_bclk(void)
{
	unsigned long long msr;
	unsigned int base_ratio;

	if (!platform->has_nhm_msrs || no_msr)
		return;

	if (platform->bclk_freq == BCLK_100MHZ)
		bclk = 100.00;
	else if (platform->bclk_freq == BCLK_133MHZ)
		bclk = 133.33;
	else if (platform->bclk_freq == BCLK_SLV)
		bclk = slm_bclk();
	else
		return;	/* unknown bclk: leave base_hz unset */

	get_msr(base_cpu, MSR_PLATFORM_INFO, &msr);
	base_ratio = (msr >> 8) & 0xFF;

	base_hz = base_ratio * bclk * 1000000;
	has_base_hz = 1;

	if (platform->enable_tsc_tweak)
		tsc_tweak = base_hz / tsc_hz;
}
6248 
/* Strip every '_' character from the NUL-terminated string s, in place. */
static void remove_underbar(char *s)
{
	char *src, *dst;

	for (src = dst = s; *src; src++) {
		if (*src != '_')
			*dst++ = *src;
	}

	*dst = '\0';
}
6261 
/*
 * dump_turbo_ratio_info()
 * Print every turbo-ratio-limit MSR that this platform advertises
 * (TRL_LIMIT2/LIMIT1/BASE/ATOM/KNL), plus config-TDP levels when
 * present.  No-op when turbo is absent or MSR access is disabled.
 */
static void dump_turbo_ratio_info(void)
{
	if (!has_turbo)
		return;

	if (!platform->has_nhm_msrs || no_msr)
		return;

	if (platform->trl_msrs & TRL_LIMIT2)
		dump_turbo_ratio_limit2();

	if (platform->trl_msrs & TRL_LIMIT1)
		dump_turbo_ratio_limit1();

	if (platform->trl_msrs & TRL_BASE) {
		dump_turbo_ratio_limits(MSR_TURBO_RATIO_LIMIT);

		/* hybrid parts carry a second limit MSR for the other core type */
		if (is_hybrid)
			dump_turbo_ratio_limits(MSR_SECONDARY_TURBO_RATIO_LIMIT);
	}

	if (platform->trl_msrs & TRL_ATOM)
		dump_atom_turbo_ratio_limits();

	if (platform->trl_msrs & TRL_KNL)
		dump_knl_turbo_ratio_limits();

	if (platform->has_config_tdp)
		dump_config_tdp();
}
6292 
6293 static int read_sysfs_int(char *path)
6294 {
6295         FILE *input;
6296         int retval = -1;
6297 
6298         input = fopen(path, "r");
6299         if (input == NULL) {
6300                 if (debug)
6301                         fprintf(outf, "NSFOD %s\n", path);
6302                 return (-1);
6303         }
6304         if (fscanf(input, "%d", &retval) != 1)
6305                 err(1, "%s: failed to read int from file", path);
6306         fclose(input);
6307 
6308         return (retval);
6309 }
6310 
6311 static void dump_sysfs_file(char *path)
6312 {
6313         FILE *input;
6314         char cpuidle_buf[64];
6315 
6316         input = fopen(path, "r");
6317         if (input == NULL) {
6318                 if (debug)
6319                         fprintf(outf, "NSFOD %s\n", path);
6320                 return;
6321         }
6322         if (!fgets(cpuidle_buf, sizeof(cpuidle_buf), input))
6323                 err(1, "%s: failed to read file", path);
6324         fclose(input);
6325 
6326         fprintf(outf, "%s: %s", strrchr(path, '/') + 1, cpuidle_buf);
6327 }
6328 
/*
 * probe_intel_uncore_frequency_legacy()
 * Probe the original per-package/per-die intel_uncore_frequency sysfs
 * interface (package_XX_die_YY directories).  When present, enable the
 * UNCORE_MHZ built-in counter and, unless --quiet, print the current,
 * configured and initial min/max uncore frequencies for each die.
 */
static void probe_intel_uncore_frequency_legacy(void)
{
	int i, j;
	char path[256];

	for (i = 0; i < topo.num_packages; ++i) {
		for (j = 0; j <= topo.max_die_id; ++j) {
			int k, l;
			char path_base[128];

			sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/package_%02d_die_%02d", i,
				j);

			/* this package/die combination is not exposed; try the next */
			if (access(path_base, R_OK))
				continue;

			BIC_PRESENT(BIC_UNCORE_MHZ);

			if (quiet)
				return;

			/* currently configured limits */
			sprintf(path, "%s/min_freq_khz", path_base);
			k = read_sysfs_int(path);
			sprintf(path, "%s/max_freq_khz", path_base);
			l = read_sysfs_int(path);
			fprintf(outf, "Uncore Frequency package%d die%d: %d - %d MHz ", i, j, k / 1000, l / 1000);

			/* hardware default limits, shown in parentheses */
			sprintf(path, "%s/initial_min_freq_khz", path_base);
			k = read_sysfs_int(path);
			sprintf(path, "%s/initial_max_freq_khz", path_base);
			l = read_sysfs_int(path);
			fprintf(outf, "(%d - %d MHz)", k / 1000, l / 1000);

			sprintf(path, "%s/current_freq_khz", path_base);
			k = read_sysfs_int(path);
			fprintf(outf, " %d MHz\n", k / 1000);
		}
	}
}
6368 
/*
 * probe_intel_uncore_frequency_cluster()
 * Probe the newer per-cluster intel_uncore_frequency sysfs interface
 * (uncoreNN directories).  Each cluster's current_freq_khz is added as
 * a "UMHz<domain>.<cluster>" package-scope counter; unless --quiet,
 * also print each cluster's current, configured and initial limits.
 */
static void probe_intel_uncore_frequency_cluster(void)
{
	int i, uncore_max_id;
	char path[256];
	char path_base[128];

	if (access("/sys/devices/system/cpu/intel_uncore_frequency/uncore00/current_freq_khz", R_OK))
		return;

	/* find the highest-numbered uncoreNN directory */
	for (uncore_max_id = 0;; ++uncore_max_id) {

		sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/uncore%02d", uncore_max_id);

		/* uncore## start at 00 and skips no numbers, so stop upon first missing */
		if (access(path_base, R_OK)) {
			uncore_max_id -= 1;
			break;
		}
	}
	/* walk from highest id down to 0 */
	for (i = uncore_max_id; i >= 0; --i) {
		int k, l;
		int package_id, domain_id, cluster_id;
		char name_buf[16];

		sprintf(path_base, "/sys/devices/system/cpu/intel_uncore_frequency/uncore%02d", i);

		/* directory existed a moment ago; vanishing now is fatal */
		if (access(path_base, R_OK))
			err(1, "%s: %s\n", __func__, path_base);

		sprintf(path, "%s/package_id", path_base);
		package_id = read_sysfs_int(path);

		sprintf(path, "%s/domain_id", path_base);
		domain_id = read_sysfs_int(path);

		sprintf(path, "%s/fabric_cluster_id", path_base);
		cluster_id = read_sysfs_int(path);

		sprintf(path, "%s/current_freq_khz", path_base);
		sprintf(name_buf, "UMHz%d.%d", domain_id, cluster_id);

		/* COUNTER_K2M: presumably scales raw kHz to MHz for display -- TODO confirm */
		add_counter(0, path, name_buf, 0, SCOPE_PACKAGE, COUNTER_K2M, FORMAT_AVERAGE, 0, package_id);

		if (quiet)
			continue;

		/* currently configured limits */
		sprintf(path, "%s/min_freq_khz", path_base);
		k = read_sysfs_int(path);
		sprintf(path, "%s/max_freq_khz", path_base);
		l = read_sysfs_int(path);
		fprintf(outf, "Uncore Frequency package%d domain%d cluster%d: %d - %d MHz ", package_id, domain_id,
			cluster_id, k / 1000, l / 1000);

		/* hardware default limits, shown in parentheses */
		sprintf(path, "%s/initial_min_freq_khz", path_base);
		k = read_sysfs_int(path);
		sprintf(path, "%s/initial_max_freq_khz", path_base);
		l = read_sysfs_int(path);
		fprintf(outf, "(%d - %d MHz)", k / 1000, l / 1000);

		sprintf(path, "%s/current_freq_khz", path_base);
		k = read_sysfs_int(path);
		fprintf(outf, " %d MHz\n", k / 1000);
	}
}
6433 
6434 static void probe_intel_uncore_frequency(void)
6435 {
6436         if (!genuine_intel)
6437                 return;
6438 
6439         if (access("/sys/devices/system/cpu/intel_uncore_frequency/uncore00", R_OK) == 0)
6440                 probe_intel_uncore_frequency_cluster();
6441         else
6442                 probe_intel_uncore_frequency_legacy();
6443 }
6444 
/*
 * probe_graphics()
 * Locate graphics idle-residency and frequency sysfs files, trying in
 * order: Xe driver knobs (per-tile gtidle/freq0), new i915 per-gt
 * knobs, then legacy i915 knobs.  Each discovered path is stored in
 * gfx_info[] and the matching built-in counter is enabled at the end.
 */
static void probe_graphics(void)
{
	/* Xe graphics sysfs knobs */
	if (!access("/sys/class/drm/card0/device/tile0/gt0/gtidle/idle_residency_ms", R_OK)) {
		FILE *fp;
		char buf[8];
		bool gt0_is_gt;
		int idx;

		/* gt0 can be either graphics ("gt0-rc*") or media ("gt0-mc*");
		 * read its name to decide which role gt0 and gt1 play */
		fp = fopen("/sys/class/drm/card0/device/tile0/gt0/gtidle/name", "r");
		if (!fp)
			goto next;

		if (!fread(buf, sizeof(char), 7, fp)) {
			fclose(fp);
			goto next;
		}
		fclose(fp);

		if (!strncmp(buf, "gt0-rc", strlen("gt0-rc")))
			gt0_is_gt = true;
		else if (!strncmp(buf, "gt0-mc", strlen("gt0-mc")))
			gt0_is_gt = false;
		else
			goto next;

		/* gt0 paths go to GFX_* or SAM_* slots depending on its role */
		idx = gt0_is_gt ? GFX_rc6 : SAM_mc6;
		gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt0/gtidle/idle_residency_ms";

		idx = gt0_is_gt ? GFX_MHz : SAM_MHz;
		if (!access("/sys/class/drm/card0/device/tile0/gt0/freq0/cur_freq", R_OK))
			gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt0/freq0/cur_freq";

		idx = gt0_is_gt ? GFX_ACTMHz : SAM_ACTMHz;
		if (!access("/sys/class/drm/card0/device/tile0/gt0/freq0/act_freq", R_OK))
			gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt0/freq0/act_freq";

		/* gt1 fills the complementary slots */
		idx = gt0_is_gt ? SAM_mc6 : GFX_rc6;
		if (!access("/sys/class/drm/card0/device/tile0/gt1/gtidle/idle_residency_ms", R_OK))
			gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt1/gtidle/idle_residency_ms";

		idx = gt0_is_gt ? SAM_MHz : GFX_MHz;
		if (!access("/sys/class/drm/card0/device/tile0/gt1/freq0/cur_freq", R_OK))
			gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt1/freq0/cur_freq";

		idx = gt0_is_gt ? SAM_ACTMHz : GFX_ACTMHz;
		if (!access("/sys/class/drm/card0/device/tile0/gt1/freq0/act_freq", R_OK))
			gfx_info[idx].path = "/sys/class/drm/card0/device/tile0/gt1/freq0/act_freq";

		goto end;
	}

next:
	/* New i915 graphics sysfs knobs */
	if (!access("/sys/class/drm/card0/gt/gt0/rc6_residency_ms", R_OK)) {
		gfx_info[GFX_rc6].path = "/sys/class/drm/card0/gt/gt0/rc6_residency_ms";

		if (!access("/sys/class/drm/card0/gt/gt0/rps_cur_freq_mhz", R_OK))
			gfx_info[GFX_MHz].path = "/sys/class/drm/card0/gt/gt0/rps_cur_freq_mhz";

		if (!access("/sys/class/drm/card0/gt/gt0/rps_act_freq_mhz", R_OK))
			gfx_info[GFX_ACTMHz].path = "/sys/class/drm/card0/gt/gt0/rps_act_freq_mhz";

		if (!access("/sys/class/drm/card0/gt/gt1/rc6_residency_ms", R_OK))
			gfx_info[SAM_mc6].path = "/sys/class/drm/card0/gt/gt1/rc6_residency_ms";

		if (!access("/sys/class/drm/card0/gt/gt1/rps_cur_freq_mhz", R_OK))
			gfx_info[SAM_MHz].path = "/sys/class/drm/card0/gt/gt1/rps_cur_freq_mhz";

		if (!access("/sys/class/drm/card0/gt/gt1/rps_act_freq_mhz", R_OK))
			gfx_info[SAM_ACTMHz].path = "/sys/class/drm/card0/gt/gt1/rps_act_freq_mhz";

		goto end;
	}

	/* Fall back to traditional i915 graphics sysfs knobs */
	if (!access("/sys/class/drm/card0/power/rc6_residency_ms", R_OK))
		gfx_info[GFX_rc6].path = "/sys/class/drm/card0/power/rc6_residency_ms";

	if (!access("/sys/class/drm/card0/gt_cur_freq_mhz", R_OK))
		gfx_info[GFX_MHz].path = "/sys/class/drm/card0/gt_cur_freq_mhz";
	else if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz", R_OK))
		gfx_info[GFX_MHz].path = "/sys/class/graphics/fb0/device/drm/card0/gt_cur_freq_mhz";

	if (!access("/sys/class/drm/card0/gt_act_freq_mhz", R_OK))
		gfx_info[GFX_ACTMHz].path = "/sys/class/drm/card0/gt_act_freq_mhz";
	else if (!access("/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz", R_OK))
		gfx_info[GFX_ACTMHz].path = "/sys/class/graphics/fb0/device/drm/card0/gt_act_freq_mhz";

end:
	/* enable the built-in counters for whatever paths were found */
	if (gfx_info[GFX_rc6].path)
		BIC_PRESENT(BIC_GFX_rc6);
	if (gfx_info[GFX_MHz].path)
		BIC_PRESENT(BIC_GFXMHz);
	if (gfx_info[GFX_ACTMHz].path)
		BIC_PRESENT(BIC_GFXACTMHz);
	if (gfx_info[SAM_mc6].path)
		BIC_PRESENT(BIC_SAM_mc6);
	if (gfx_info[SAM_MHz].path)
		BIC_PRESENT(BIC_SAMMHz);
	if (gfx_info[SAM_ACTMHz].path)
		BIC_PRESENT(BIC_SAMACTMHz);
}
6548 
/*
 * dump_sysfs_cstate_config()
 * Print the cpuidle driver/governor, then for each of base_cpu's idle
 * states (state0..state9) its abbreviated name and full description.
 */
static void dump_sysfs_cstate_config(void)
{
	char path[64];
	char name_buf[16];
	char desc[64];
	FILE *input;
	int state;
	char *sp;

	if (access("/sys/devices/system/cpu/cpuidle", R_OK)) {
		fprintf(outf, "cpuidle not loaded\n");
		return;
	}

	dump_sysfs_file("/sys/devices/system/cpu/cpuidle/current_driver");
	dump_sysfs_file("/sys/devices/system/cpu/cpuidle/current_governor");
	dump_sysfs_file("/sys/devices/system/cpu/cpuidle/current_governor_ro");

	for (state = 0; state < 10; ++state) {

		sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state);
		input = fopen(path, "r");
		if (input == NULL)
			continue;
		if (!fgets(name_buf, sizeof(name_buf), input))
			err(1, "%s: failed to read file", path);

		/* truncate "C1-HSW\n" to "C1", or truncate "C1\n" to "C1" */
		sp = strchr(name_buf, '-');
		if (!sp)
			sp = strchrnul(name_buf, '\n');
		*sp = '\0';
		fclose(input);

		remove_underbar(name_buf);

		sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/desc", base_cpu, state);
		input = fopen(path, "r");
		if (input == NULL)
			continue;
		if (!fgets(desc, sizeof(desc), input))
			err(1, "%s: failed to read file", path);

		/* desc retains its trailing newline from fgets */
		fprintf(outf, "cpu%d: %s: %s", base_cpu, name_buf, desc);
		fclose(input);
	}
}
6596 
/*
 * dump_sysfs_pstate_config()
 * Print the cpufreq scaling driver and governor for base_cpu, plus the
 * global cpufreq "boost" and intel_pstate "no_turbo" knobs if present.
 */
static void dump_sysfs_pstate_config(void)
{
	char path[64];
	char driver_buf[64];
	char governor_buf[64];
	FILE *input;
	int turbo;

	sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_driver", base_cpu);
	input = fopen(path, "r");
	if (input == NULL) {
		fprintf(outf, "NSFOD %s\n", path);
		return;
	}
	if (!fgets(driver_buf, sizeof(driver_buf), input))
		err(1, "%s: failed to read file", path);
	fclose(input);

	sprintf(path, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_governor", base_cpu);
	input = fopen(path, "r");
	if (input == NULL) {
		fprintf(outf, "NSFOD %s\n", path);
		return;
	}
	if (!fgets(governor_buf, sizeof(governor_buf), input))
		err(1, "%s: failed to read file", path);
	fclose(input);

	/* buffers retain their trailing newlines from fgets */
	fprintf(outf, "cpu%d: cpufreq driver: %s", base_cpu, driver_buf);
	fprintf(outf, "cpu%d: cpufreq governor: %s", base_cpu, governor_buf);

	/* optional knobs: silently skipped when absent */
	sprintf(path, "/sys/devices/system/cpu/cpufreq/boost");
	input = fopen(path, "r");
	if (input != NULL) {
		if (fscanf(input, "%d", &turbo) != 1)
			err(1, "%s: failed to parse number from file", path);
		fprintf(outf, "cpufreq boost: %d\n", turbo);
		fclose(input);
	}

	sprintf(path, "/sys/devices/system/cpu/intel_pstate/no_turbo");
	input = fopen(path, "r");
	if (input != NULL) {
		if (fscanf(input, "%d", &turbo) != 1)
			err(1, "%s: failed to parse number from file", path);
		fprintf(outf, "cpufreq intel_pstate no_turbo: %d\n", turbo);
		fclose(input);
	}
}
6646 
6647 /*
6648  * print_epb()
6649  * Decode the ENERGY_PERF_BIAS MSR
6650  */
6651 int print_epb(struct thread_data *t, struct core_data *c, struct pkg_data *p)
6652 {
6653         char *epb_string;
6654         int cpu, epb;
6655 
6656         UNUSED(c);
6657         UNUSED(p);
6658 
6659         if (!has_epb)
6660                 return 0;
6661 
6662         cpu = t->cpu_id;
6663 
6664         /* EPB is per-package */
6665         if (!is_cpu_first_thread_in_package(t, c, p))
6666                 return 0;
6667 
6668         if (cpu_migrate(cpu)) {
6669                 fprintf(outf, "print_epb: Could not migrate to CPU %d\n", cpu);
6670                 return -1;
6671         }
6672 
6673         epb = get_epb(cpu);
6674         if (epb < 0)
6675                 return 0;
6676 
6677         switch (epb) {
6678         case ENERGY_PERF_BIAS_PERFORMANCE:
6679                 epb_string = "performance";
6680                 break;
6681         case ENERGY_PERF_BIAS_NORMAL:
6682                 epb_string = "balanced";
6683                 break;
6684         case ENERGY_PERF_BIAS_POWERSAVE:
6685                 epb_string = "powersave";
6686                 break;
6687         default:
6688                 epb_string = "custom";
6689                 break;
6690         }
6691         fprintf(outf, "cpu%d: EPB: %d (%s)\n", cpu, epb, epb_string);
6692 
6693         return 0;
6694 }
6695 
6696 /*
6697  * print_hwp()
6698  * Decode the MSR_HWP_CAPABILITIES
6699  */
6700 int print_hwp(struct thread_data *t, struct core_data *c, struct pkg_data *p)
6701 {
6702         unsigned long long msr;
6703         int cpu;
6704 
6705         UNUSED(c);
6706         UNUSED(p);
6707 
6708         if (no_msr)
6709                 return 0;
6710 
6711         if (!has_hwp)
6712                 return 0;
6713 
6714         cpu = t->cpu_id;
6715 
6716         /* MSR_HWP_CAPABILITIES is per-package */
6717         if (!is_cpu_first_thread_in_package(t, c, p))
6718                 return 0;
6719 
6720         if (cpu_migrate(cpu)) {
6721                 fprintf(outf, "print_hwp: Could not migrate to CPU %d\n", cpu);
6722                 return -1;
6723         }
6724 
6725         if (get_msr(cpu, MSR_PM_ENABLE, &msr))
6726                 return 0;
6727 
6728         fprintf(outf, "cpu%d: MSR_PM_ENABLE: 0x%08llx (%sHWP)\n", cpu, msr, (msr & (1 << 0)) ? "" : "No-");
6729 
6730         /* MSR_PM_ENABLE[1] == 1 if HWP is enabled and MSRs visible */
6731         if ((msr & (1 << 0)) == 0)
6732                 return 0;
6733 
6734         if (get_msr(cpu, MSR_HWP_CAPABILITIES, &msr))
6735                 return 0;
6736 
6737         fprintf(outf, "cpu%d: MSR_HWP_CAPABILITIES: 0x%08llx "
6738                 "(high %d guar %d eff %d low %d)\n",
6739                 cpu, msr,
6740                 (unsigned int)HWP_HIGHEST_PERF(msr),
6741                 (unsigned int)HWP_GUARANTEED_PERF(msr),
6742                 (unsigned int)HWP_MOSTEFFICIENT_PERF(msr), (unsigned int)HWP_LOWEST_PERF(msr));
6743 
6744         if (get_msr(cpu, MSR_HWP_REQUEST, &msr))
6745                 return 0;
6746 
6747         fprintf(outf, "cpu%d: MSR_HWP_REQUEST: 0x%08llx "
6748                 "(min %d max %d des %d epp 0x%x window 0x%x pkg 0x%x)\n",
6749                 cpu, msr,
6750                 (unsigned int)(((msr) >> 0) & 0xff),
6751                 (unsigned int)(((msr) >> 8) & 0xff),
6752                 (unsigned int)(((msr) >> 16) & 0xff),
6753                 (unsigned int)(((msr) >> 24) & 0xff),
6754                 (unsigned int)(((msr) >> 32) & 0xff3), (unsigned int)(((msr) >> 42) & 0x1));
6755 
6756         if (has_hwp_pkg) {
6757                 if (get_msr(cpu, MSR_HWP_REQUEST_PKG, &msr))
6758                         return 0;
6759 
6760                 fprintf(outf, "cpu%d: MSR_HWP_REQUEST_PKG: 0x%08llx "
6761                         "(min %d max %d des %d epp 0x%x window 0x%x)\n",
6762                         cpu, msr,
6763                         (unsigned int)(((msr) >> 0) & 0xff),
6764                         (unsigned int)(((msr) >> 8) & 0xff),
6765                         (unsigned int)(((msr) >> 16) & 0xff),
6766                         (unsigned int)(((msr) >> 24) & 0xff), (unsigned int)(((msr) >> 32) & 0xff3));
6767         }
6768         if (has_hwp_notify) {
6769                 if (get_msr(cpu, MSR_HWP_INTERRUPT, &msr))
6770                         return 0;
6771 
6772                 fprintf(outf, "cpu%d: MSR_HWP_INTERRUPT: 0x%08llx "
6773                         "(%s_Guaranteed_Perf_Change, %s_Excursion_Min)\n",
6774                         cpu, msr, ((msr) & 0x1) ? "EN" : "Dis", ((msr) & 0x2) ? "EN" : "Dis");
6775         }
6776         if (get_msr(cpu, MSR_HWP_STATUS, &msr))
6777                 return 0;
6778 
6779         fprintf(outf, "cpu%d: MSR_HWP_STATUS: 0x%08llx "
6780                 "(%sGuaranteed_Perf_Change, %sExcursion_Min)\n",
6781                 cpu, msr, ((msr) & 0x1) ? "" : "No-", ((msr) & 0x4) ? "" : "No-");
6782 
6783         return 0;
6784 }
6785 
6786 /*
6787  * print_perf_limit()
6788  */
6789 int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data *p)
6790 {
6791         unsigned long long msr;
6792         int cpu;
6793 
6794         UNUSED(c);
6795         UNUSED(p);
6796 
6797         if (no_msr)
6798                 return 0;
6799 
6800         cpu = t->cpu_id;
6801 
6802         /* per-package */
6803         if (!is_cpu_first_thread_in_package(t, c, p))
6804                 return 0;
6805 
6806         if (cpu_migrate(cpu)) {
6807                 fprintf(outf, "print_perf_limit: Could not migrate to CPU %d\n", cpu);
6808                 return -1;
6809         }
6810 
6811         if (platform->plr_msrs & PLR_CORE) {
6812                 get_msr(cpu, MSR_CORE_PERF_LIMIT_REASONS, &msr);
6813                 fprintf(outf, "cpu%d: MSR_CORE_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr);
6814                 fprintf(outf, " (Active: %s%s%s%s%s%s%s%s%s%s%s%s%s%s)",
6815                         (msr & 1 << 15) ? "bit15, " : "",
6816                         (msr & 1 << 14) ? "bit14, " : "",
6817                         (msr & 1 << 13) ? "Transitions, " : "",
6818                         (msr & 1 << 12) ? "MultiCoreTurbo, " : "",
6819                         (msr & 1 << 11) ? "PkgPwrL2, " : "",
6820                         (msr & 1 << 10) ? "PkgPwrL1, " : "",
6821                         (msr & 1 << 9) ? "CorePwr, " : "",
6822                         (msr & 1 << 8) ? "Amps, " : "",
6823                         (msr & 1 << 6) ? "VR-Therm, " : "",
6824                         (msr & 1 << 5) ? "Auto-HWP, " : "",
6825                         (msr & 1 << 4) ? "Graphics, " : "",
6826                         (msr & 1 << 2) ? "bit2, " : "",
6827                         (msr & 1 << 1) ? "ThermStatus, " : "", (msr & 1 << 0) ? "PROCHOT, " : "");
6828                 fprintf(outf, " (Logged: %s%s%s%s%s%s%s%s%s%s%s%s%s%s)\n",
6829                         (msr & 1 << 31) ? "bit31, " : "",
6830                         (msr & 1 << 30) ? "bit30, " : "",
6831                         (msr & 1 << 29) ? "Transitions, " : "",
6832                         (msr & 1 << 28) ? "MultiCoreTurbo, " : "",
6833                         (msr & 1 << 27) ? "PkgPwrL2, " : "",
6834                         (msr & 1 << 26) ? "PkgPwrL1, " : "",
6835                         (msr & 1 << 25) ? "CorePwr, " : "",
6836                         (msr & 1 << 24) ? "Amps, " : "",
6837                         (msr & 1 << 22) ? "VR-Therm, " : "",
6838                         (msr & 1 << 21) ? "Auto-HWP, " : "",
6839                         (msr & 1 << 20) ? "Graphics, " : "",
6840                         (msr & 1 << 18) ? "bit18, " : "",
6841                         (msr & 1 << 17) ? "ThermStatus, " : "", (msr & 1 << 16) ? "PROCHOT, " : "");
6842 
6843         }
6844         if (platform->plr_msrs & PLR_GFX) {
6845                 get_msr(cpu, MSR_GFX_PERF_LIMIT_REASONS, &msr);
6846                 fprintf(outf, "cpu%d: MSR_GFX_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr);
6847                 fprintf(outf, " (Active: %s%s%s%s%s%s%s%s)",
6848                         (msr & 1 << 0) ? "PROCHOT, " : "",
6849                         (msr & 1 << 1) ? "ThermStatus, " : "",
6850                         (msr & 1 << 4) ? "Graphics, " : "",
6851                         (msr & 1 << 6) ? "VR-Therm, " : "",
6852                         (msr & 1 << 8) ? "Amps, " : "",
6853                         (msr & 1 << 9) ? "GFXPwr, " : "",
6854                         (msr & 1 << 10) ? "PkgPwrL1, " : "", (msr & 1 << 11) ? "PkgPwrL2, " : "");
6855                 fprintf(outf, " (Logged: %s%s%s%s%s%s%s%s)\n",
6856                         (msr & 1 << 16) ? "PROCHOT, " : "",
6857                         (msr & 1 << 17) ? "ThermStatus, " : "",
6858                         (msr & 1 << 20) ? "Graphics, " : "",
6859                         (msr & 1 << 22) ? "VR-Therm, " : "",
6860                         (msr & 1 << 24) ? "Amps, " : "",
6861                         (msr & 1 << 25) ? "GFXPwr, " : "",
6862                         (msr & 1 << 26) ? "PkgPwrL1, " : "", (msr & 1 << 27) ? "PkgPwrL2, " : "");
6863         }
6864         if (platform->plr_msrs & PLR_RING) {
6865                 get_msr(cpu, MSR_RING_PERF_LIMIT_REASONS, &msr);
6866                 fprintf(outf, "cpu%d: MSR_RING_PERF_LIMIT_REASONS, 0x%08llx", cpu, msr);
6867                 fprintf(outf, " (Active: %s%s%s%s%s%s)",
6868                         (msr & 1 << 0) ? "PROCHOT, " : "",
6869                         (msr & 1 << 1) ? "ThermStatus, " : "",
6870                         (msr & 1 << 6) ? "VR-Therm, " : "",
6871                         (msr & 1 << 8) ? "Amps, " : "",
6872                         (msr & 1 << 10) ? "PkgPwrL1, " : "", (msr & 1 << 11) ? "PkgPwrL2, " : "");
6873                 fprintf(outf, " (Logged: %s%s%s%s%s%s)\n",
6874                         (msr & 1 << 16) ? "PROCHOT, " : "",
6875                         (msr & 1 << 17) ? "ThermStatus, " : "",
6876                         (msr & 1 << 22) ? "VR-Therm, " : "",
6877                         (msr & 1 << 24) ? "Amps, " : "",
6878                         (msr & 1 << 26) ? "PkgPwrL1, " : "", (msr & 1 << 27) ? "PkgPwrL2, " : "");
6879         }
6880         return 0;
6881 }
6882 
6883 #define RAPL_POWER_GRANULARITY  0x7FFF  /* 15 bit power granularity */
6884 #define RAPL_TIME_GRANULARITY   0x3F    /* 6 bit time granularity */
6885 
6886 double get_quirk_tdp(void)
6887 {
6888         if (platform->rapl_quirk_tdp)
6889                 return platform->rapl_quirk_tdp;
6890 
6891         return 135.0;
6892 }
6893 
6894 double get_tdp_intel(void)
6895 {
6896         unsigned long long msr;
6897 
6898         if (platform->rapl_msrs & RAPL_PKG_POWER_INFO)
6899                 if (!get_msr(base_cpu, MSR_PKG_POWER_INFO, &msr))
6900                         return ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units;
6901         return get_quirk_tdp();
6902 }
6903 
/* AMD path: no MSR is read here; always report the quirk/default TDP. */
double get_tdp_amd(void)
{
	return get_quirk_tdp();
}
6908 
/*
 * rapl_probe_intel()
 * Decode MSR_RAPL_POWER_UNIT to initialize the global RAPL power,
 * energy and time units, honor the --joules vs Watts column choice,
 * and compute how long the 32-bit Joule counter takes to wrap at TDP.
 */
void rapl_probe_intel(void)
{
	unsigned long long msr;
	unsigned int time_unit;
	double tdp;
	const unsigned long long bic_watt_bits = BIC_PkgWatt | BIC_CorWatt | BIC_RAMWatt | BIC_GFXWatt;
	const unsigned long long bic_joules_bits = BIC_Pkg_J | BIC_Cor_J | BIC_RAM_J | BIC_GFX_J;

	/* show either Watts or Joules columns, never both */
	if (rapl_joules)
		bic_enabled &= ~bic_watt_bits;
	else
		bic_enabled &= ~bic_joules_bits;

	if (!(platform->rapl_msrs & RAPL_PKG_PERF_STATUS))
		bic_enabled &= ~BIC_PKG__;
	if (!(platform->rapl_msrs & RAPL_DRAM_PERF_STATUS))
		bic_enabled &= ~BIC_RAM__;

	/* units on package 0, verify later other packages match */
	if (get_msr(base_cpu, MSR_RAPL_POWER_UNIT, &msr))
		return;

	rapl_power_units = 1.0 / (1 << (msr & 0xF));	/* bits 3:0 */
	/* bits 12:8: some platforms encode a uJ divisor rather than a 2^-N unit */
	if (platform->has_rapl_divisor)
		rapl_energy_units = 1.0 * (1 << (msr >> 8 & 0x1F)) / 1000000;
	else
		rapl_energy_units = 1.0 / (1 << (msr >> 8 & 0x1F));

	/* some platforms use a fixed 15.3 uJ DRAM energy unit */
	if (platform->has_fixed_rapl_unit)
		rapl_dram_energy_units = (15.3 / 1000000);
	else
		rapl_dram_energy_units = rapl_energy_units;

	time_unit = msr >> 16 & 0xF;
	if (time_unit == 0)
		time_unit = 0xA;	/* default: 2^-10 = 1/1024 sec */

	rapl_time_units = 1.0 / (1 << (time_unit));

	tdp = get_tdp_intel();

	rapl_joule_counter_range = 0xFFFFFFFF * rapl_energy_units / tdp;
	if (!quiet)
		fprintf(outf, "RAPL: %.0f sec. Joule Counter Range, at %.0f Watts\n", rapl_joule_counter_range, tdp);
}
6954 
/*
 * rapl_probe_amd()
 * Decode MSR_RAPL_PWR_UNIT to initialize the global RAPL power,
 * energy and time units, honor the --joules vs Watts column choice,
 * and compute how long the 32-bit Joule counter takes to wrap at TDP.
 */
void rapl_probe_amd(void)
{
	unsigned long long msr;
	double tdp;
	const unsigned long long bic_watt_bits = BIC_PkgWatt | BIC_CorWatt;
	const unsigned long long bic_joules_bits = BIC_Pkg_J | BIC_Cor_J;

	/* show either Watts or Joules columns, never both */
	if (rapl_joules)
		bic_enabled &= ~bic_watt_bits;
	else
		bic_enabled &= ~bic_joules_bits;

	if (get_msr(base_cpu, MSR_RAPL_PWR_UNIT, &msr))
		return;

	/* units are 2^-N encodings: time bits 19:16, energy 12:8, power 3:0 */
	rapl_time_units = ldexp(1.0, -(msr >> 16 & 0xf));
	rapl_energy_units = ldexp(1.0, -(msr >> 8 & 0x1f));
	rapl_power_units = ldexp(1.0, -(msr & 0xf));

	tdp = get_tdp_amd();

	rapl_joule_counter_range = 0xFFFFFFFF * rapl_energy_units / tdp;
	if (!quiet)
		fprintf(outf, "RAPL: %.0f sec. Joule Counter Range, at %.0f Watts\n", rapl_joule_counter_range, tdp);
}
6980 
/*
 * print_power_limit_msr()
 * Print one half of a RAPL POWER_LIMIT MSR: enable (bit 15), power
 * limit (bits 14:0, in RAPL power units), time window decoded as
 * (1 + F/4) * 2^Y with F in bits 23:22 and Y in bits 21:17 (in RAPL
 * time units), and clamp enable (bit 16).
 */
void print_power_limit_msr(int cpu, unsigned long long msr, char *label)
{
	fprintf(outf, "cpu%d: %s: %sabled (%0.3f Watts, %f sec, clamp %sabled)\n",
		cpu, label,
		((msr >> 15) & 1) ? "EN" : "DIS",
		((msr >> 0) & 0x7FFF) * rapl_power_units,
		(1.0 + (((msr >> 22) & 0x3) / 4.0)) * (1 << ((msr >> 17) & 0x1F)) * rapl_time_units,
		(((msr >> 16) & 1) ? "EN" : "DIS"));

	return;
}
6992 
6993 int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p)
6994 {
6995         unsigned long long msr;
6996         const char *msr_name;
6997         int cpu;
6998 
6999         UNUSED(c);
7000         UNUSED(p);
7001 
7002         if (!platform->rapl_msrs)
7003                 return 0;
7004 
7005         /* RAPL counters are per package, so print only for 1st thread/package */
7006         if (!is_cpu_first_thread_in_package(t, c, p))
7007                 return 0;
7008 
7009         cpu = t->cpu_id;
7010         if (cpu_migrate(cpu)) {
7011                 fprintf(outf, "print_rapl: Could not migrate to CPU %d\n", cpu);
7012                 return -1;
7013         }
7014 
7015         if (platform->rapl_msrs & RAPL_AMD_F17H) {
7016                 msr_name = "MSR_RAPL_PWR_UNIT";
7017                 if (get_msr(cpu, MSR_RAPL_PWR_UNIT, &msr))
7018                         return -1;
7019         } else {
7020                 msr_name = "MSR_RAPL_POWER_UNIT";
7021                 if (get_msr(cpu, MSR_RAPL_POWER_UNIT, &msr))
7022                         return -1;
7023         }
7024 
7025         fprintf(outf, "cpu%d: %s: 0x%08llx (%f Watts, %f Joules, %f sec.)\n", cpu, msr_name, msr,
7026                 rapl_power_units, rapl_energy_units, rapl_time_units);
7027 
7028         if (platform->rapl_msrs & RAPL_PKG_POWER_INFO) {
7029 
7030                 if (get_msr(cpu, MSR_PKG_POWER_INFO, &msr))
7031                         return -5;
7032 
7033                 fprintf(outf, "cpu%d: MSR_PKG_POWER_INFO: 0x%08llx (%.0f W TDP, RAPL %.0f - %.0f W, %f sec.)\n",
7034                         cpu, msr,
7035                         ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units,
7036                         ((msr >> 16) & RAPL_POWER_GRANULARITY) * rapl_power_units,
7037                         ((msr >> 32) & RAPL_POWER_GRANULARITY) * rapl_power_units,
7038                         ((msr >> 48) & RAPL_TIME_GRANULARITY) * rapl_time_units);
7039 
7040         }
7041         if (platform->rapl_msrs & RAPL_PKG) {
7042 
7043                 if (get_msr(cpu, MSR_PKG_POWER_LIMIT, &msr))
7044                         return -9;
7045 
7046                 fprintf(outf, "cpu%d: MSR_PKG_POWER_LIMIT: 0x%08llx (%slocked)\n",
7047                         cpu, msr, (msr >> 63) & 1 ? "" : "UN");
7048 
7049                 print_power_limit_msr(cpu, msr, "PKG Limit #1");
7050                 fprintf(outf, "cpu%d: PKG Limit #2: %sabled (%0.3f Watts, %f* sec, clamp %sabled)\n",
7051                         cpu,
7052                         ((msr >> 47) & 1) ? "EN" : "DIS",
7053                         ((msr >> 32) & 0x7FFF) * rapl_power_units,
7054                         (1.0 + (((msr >> 54) & 0x3) / 4.0)) * (1 << ((msr >> 49) & 0x1F)) * rapl_time_units,
7055                         ((msr >> 48) & 1) ? "EN" : "DIS");
7056 
7057                 if (get_msr(cpu, MSR_VR_CURRENT_CONFIG, &msr))
7058                         return -9;
7059 
7060                 fprintf(outf, "cpu%d: MSR_VR_CURRENT_CONFIG: 0x%08llx\n", cpu, msr);
7061                 fprintf(outf, "cpu%d: PKG Limit #4: %f Watts (%slocked)\n",
7062                         cpu, ((msr >> 0) & 0x1FFF) * rapl_power_units, (msr >> 31) & 1 ? "" : "UN");
7063         }
7064 
7065         if (platform->rapl_msrs & RAPL_DRAM_POWER_INFO) {
7066                 if (get_msr(cpu, MSR_DRAM_POWER_INFO, &msr))
7067                         return -6;
7068 
7069                 fprintf(outf, "cpu%d: MSR_DRAM_POWER_INFO,: 0x%08llx (%.0f W TDP, RAPL %.0f - %.0f W, %f sec.)\n",
7070                         cpu, msr,
7071                         ((msr >> 0) & RAPL_POWER_GRANULARITY) * rapl_power_units,
7072                         ((msr >> 16) & RAPL_POWER_GRANULARITY) * rapl_power_units,
7073                         ((msr >> 32) & RAPL_POWER_GRANULARITY) * rapl_power_units,
7074                         ((msr >> 48) & RAPL_TIME_GRANULARITY) * rapl_time_units);
7075         }
7076         if (platform->rapl_msrs & RAPL_DRAM) {
7077                 if (get_msr(cpu, MSR_DRAM_POWER_LIMIT, &msr))
7078                         return -9;
7079                 fprintf(outf, "cpu%d: MSR_DRAM_POWER_LIMIT: 0x%08llx (%slocked)\n",
7080                         cpu, msr, (msr >> 31) & 1 ? "" : "UN");
7081 
7082                 print_power_limit_msr(cpu, msr, "DRAM Limit");
7083         }
7084         if (platform->rapl_msrs & RAPL_CORE_POLICY) {
7085                 if (get_msr(cpu, MSR_PP0_POLICY, &msr))
7086                         return -7;
7087 
7088                 fprintf(outf, "cpu%d: MSR_PP0_POLICY: %lld\n", cpu, msr & 0xF);
7089         }
7090         if (platform->rapl_msrs & RAPL_CORE_POWER_LIMIT) {
7091                 if (get_msr(cpu, MSR_PP0_POWER_LIMIT, &msr))
7092                         return -9;
7093                 fprintf(outf, "cpu%d: MSR_PP0_POWER_LIMIT: 0x%08llx (%slocked)\n",
7094                         cpu, msr, (msr >> 31) & 1 ? "" : "UN");
7095                 print_power_limit_msr(cpu, msr, "Cores Limit");
7096         }
7097         if (platform->rapl_msrs & RAPL_GFX) {
7098                 if (get_msr(cpu, MSR_PP1_POLICY, &msr))
7099                         return -8;
7100 
7101                 fprintf(outf, "cpu%d: MSR_PP1_POLICY: %lld\n", cpu, msr & 0xF);
7102 
7103                 if (get_msr(cpu, MSR_PP1_POWER_LIMIT, &msr))
7104                         return -9;
7105                 fprintf(outf, "cpu%d: MSR_PP1_POWER_LIMIT: 0x%08llx (%slocked)\n",
7106                         cpu, msr, (msr >> 31) & 1 ? "" : "UN");
7107                 print_power_limit_msr(cpu, msr, "GFX Limit");
7108         }
7109         return 0;
7110 }
7111 
7112 /*
7113  * probe_rapl()
7114  *
7115  * sets rapl_power_units, rapl_energy_units, rapl_time_units
7116  */
7117 void probe_rapl(void)
7118 {
7119         if (!platform->rapl_msrs || no_msr)
7120                 return;
7121 
7122         if (genuine_intel)
7123                 rapl_probe_intel();
7124         if (authentic_amd || hygon_genuine)
7125                 rapl_probe_amd();
7126 
7127         if (quiet)
7128                 return;
7129 
7130         for_all_cpus(print_rapl, ODD_COUNTERS);
7131 }
7132 
7133 /*
7134  * MSR_IA32_TEMPERATURE_TARGET indicates the temperature where
7135  * the Thermal Control Circuit (TCC) activates.
7136  * This is usually equal to tjMax.
7137  *
7138  * Older processors do not have this MSR, so there we guess,
7139  * but also allow cmdline over-ride with -T.
7140  *
7141  * Several MSR temperature values are in units of degrees-C
7142  * below this value, including the Digital Thermal Sensor (DTS),
7143  * Package Thermal Management Sensor (PTM), and thermal event thresholds.
7144  */
/* Determine tj_max: cmdline override > MSR_IA32_TEMPERATURE_TARGET > guess. */
int set_temperature_target(struct thread_data *t, struct core_data *c, struct pkg_data *p)
{
	unsigned long long msr;
	unsigned int tcc_default, tcc_offset;
	int cpu;

	UNUSED(c);
	UNUSED(p);

	/* tj_max is used only for dts or ptm */
	if (!(do_dts || do_ptm))
		return 0;

	/* this is a per-package concept */
	if (!is_cpu_first_thread_in_package(t, c, p))
		return 0;

	cpu = t->cpu_id;
	if (cpu_migrate(cpu)) {
		fprintf(outf, "Could not migrate to CPU %d\n", cpu);
		return -1;
	}

	/* -T on the cmdline wins over everything else */
	if (tj_max_override != 0) {
		tj_max = tj_max_override;
		fprintf(outf, "cpu%d: Using cmdline TCC Target (%d C)\n", cpu, tj_max);
		return 0;
	}

	/* Temperature Target MSR is Nehalem and newer only */
	if (!platform->has_nhm_msrs || no_msr)
		goto guess;

	if (get_msr(base_cpu, MSR_IA32_TEMPERATURE_TARGET, &msr))
		goto guess;

	/* bits 23:16 hold the TCC activation temperature in degrees C */
	tcc_default = (msr >> 16) & 0xFF;

	if (!quiet) {
		int bits = platform->tcc_offset_bits;
		unsigned long long enabled = 0;

		/* MSR_PLATFORM_INFO bit 30 gates the programmable TCC offset */
		if (bits && !get_msr(base_cpu, MSR_PLATFORM_INFO, &enabled))
			enabled = (enabled >> 30) & 1;

		if (bits && enabled) {
			/* offset (bits 24+) is subtracted from the default target */
			tcc_offset = (msr >> 24) & GENMASK(bits - 1, 0);
			fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C) (%d default - %d offset)\n",
				cpu, msr, tcc_default - tcc_offset, tcc_default, tcc_offset);
		} else {
			fprintf(outf, "cpu%d: MSR_IA32_TEMPERATURE_TARGET: 0x%08llx (%d C)\n", cpu, msr, tcc_default);
		}
	}

	/* a zero target means the MSR field is unimplemented; fall back */
	if (!tcc_default)
		goto guess;

	tj_max = tcc_default;

	return 0;

guess:
	tj_max = TJMAX_DEFAULT;
	fprintf(outf, "cpu%d: Guessing tjMax %d C, Please use -T to specify\n", cpu, tj_max);

	return 0;
}
7212 
/*
 * Print per-core DTS and per-package PTM thermal status/interrupt MSRs.
 * Raw DTS readings are degrees C below tj_max, hence "tj_max - dts".
 */
int print_thermal(struct thread_data *t, struct core_data *c, struct pkg_data *p)
{
	unsigned long long msr;
	unsigned int dts, dts2;
	int cpu;

	UNUSED(c);
	UNUSED(p);

	if (no_msr)
		return 0;

	if (!(do_dts || do_ptm))
		return 0;

	cpu = t->cpu_id;

	/* DTS is per-core, no need to print for each thread */
	if (!is_cpu_first_thread_in_core(t, c, p))
		return 0;

	if (cpu_migrate(cpu)) {
		fprintf(outf, "print_thermal: Could not migrate to CPU %d\n", cpu);
		return -1;
	}

	/* package-level sensors: only for the first core in the package */
	if (do_ptm && is_cpu_first_core_in_package(t, c, p)) {
		if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr))
			return 0;

		/* bits 22:16: digital readout, degrees below tj_max */
		dts = (msr >> 16) & 0x7F;
		fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_STATUS: 0x%08llx (%d C)\n", cpu, msr, tj_max - dts);

		if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, &msr))
			return 0;

		/* two programmable interrupt thresholds, same encoding */
		dts = (msr >> 16) & 0x7F;
		dts2 = (msr >> 8) & 0x7F;
		fprintf(outf, "cpu%d: MSR_IA32_PACKAGE_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n",
			cpu, msr, tj_max - dts, tj_max - dts2);
	}

	/* per-core DTS detail only in debug mode */
	if (do_dts && debug) {
		unsigned int resolution;

		if (get_msr(cpu, MSR_IA32_THERM_STATUS, &msr))
			return 0;

		/* bits 30:27: resolution of the reading, in degrees C */
		dts = (msr >> 16) & 0x7F;
		resolution = (msr >> 27) & 0xF;
		fprintf(outf, "cpu%d: MSR_IA32_THERM_STATUS: 0x%08llx (%d C +/- %d)\n",
			cpu, msr, tj_max - dts, resolution);

		if (get_msr(cpu, MSR_IA32_THERM_INTERRUPT, &msr))
			return 0;

		dts = (msr >> 16) & 0x7F;
		dts2 = (msr >> 8) & 0x7F;
		fprintf(outf, "cpu%d: MSR_IA32_THERM_INTERRUPT: 0x%08llx (%d C, %d C)\n",
			cpu, msr, tj_max - dts, tj_max - dts2);
	}

	return 0;
}
7277 
7278 void probe_thermal(void)
7279 {
7280         if (!access("/sys/devices/system/cpu/cpu0/thermal_throttle/core_throttle_count", R_OK))
7281                 BIC_PRESENT(BIC_CORE_THROT_CNT);
7282         else
7283                 BIC_NOT_PRESENT(BIC_CORE_THROT_CNT);
7284 
7285         for_all_cpus(set_temperature_target, ODD_COUNTERS);
7286 
7287         if (quiet)
7288                 return;
7289 
7290         for_all_cpus(print_thermal, ODD_COUNTERS);
7291 }
7292 
7293 int get_cpu_type(struct thread_data *t, struct core_data *c, struct pkg_data *p)
7294 {
7295         unsigned int eax, ebx, ecx, edx;
7296 
7297         UNUSED(c);
7298         UNUSED(p);
7299 
7300         if (!genuine_intel)
7301                 return 0;
7302 
7303         if (cpu_migrate(t->cpu_id)) {
7304                 fprintf(outf, "Could not migrate to CPU %d\n", t->cpu_id);
7305                 return -1;
7306         }
7307 
7308         if (max_level < 0x1a)
7309                 return 0;
7310 
7311         __cpuid(0x1a, eax, ebx, ecx, edx);
7312         eax = (eax >> 24) & 0xFF;
7313         if (eax == 0x20)
7314                 t->is_atom = true;
7315         return 0;
7316 }
7317 
7318 void decode_feature_control_msr(void)
7319 {
7320         unsigned long long msr;
7321 
7322         if (no_msr)
7323                 return;
7324 
7325         if (!get_msr(base_cpu, MSR_IA32_FEAT_CTL, &msr))
7326                 fprintf(outf, "cpu%d: MSR_IA32_FEATURE_CONTROL: 0x%08llx (%sLocked %s)\n",
7327                         base_cpu, msr, msr & FEAT_CTL_LOCKED ? "" : "UN-", msr & (1 << 18) ? "SGX" : "");
7328 }
7329 
7330 void decode_misc_enable_msr(void)
7331 {
7332         unsigned long long msr;
7333 
7334         if (no_msr)
7335                 return;
7336 
7337         if (!genuine_intel)
7338                 return;
7339 
7340         if (!get_msr(base_cpu, MSR_IA32_MISC_ENABLE, &msr))
7341                 fprintf(outf, "cpu%d: MSR_IA32_MISC_ENABLE: 0x%08llx (%sTCC %sEIST %sMWAIT %sPREFETCH %sTURBO)\n",
7342                         base_cpu, msr,
7343                         msr & MSR_IA32_MISC_ENABLE_TM1 ? "" : "No-",
7344                         msr & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP ? "" : "No-",
7345                         msr & MSR_IA32_MISC_ENABLE_MWAIT ? "" : "No-",
7346                         msr & MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE ? "No-" : "",
7347                         msr & MSR_IA32_MISC_ENABLE_TURBO_DISABLE ? "No-" : "");
7348 }
7349 
7350 void decode_misc_feature_control(void)
7351 {
7352         unsigned long long msr;
7353 
7354         if (no_msr)
7355                 return;
7356 
7357         if (!platform->has_msr_misc_feature_control)
7358                 return;
7359 
7360         if (!get_msr(base_cpu, MSR_MISC_FEATURE_CONTROL, &msr))
7361                 fprintf(outf,
7362                         "cpu%d: MSR_MISC_FEATURE_CONTROL: 0x%08llx (%sL2-Prefetch %sL2-Prefetch-pair %sL1-Prefetch %sL1-IP-Prefetch)\n",
7363                         base_cpu, msr, msr & (0 << 0) ? "No-" : "", msr & (1 << 0) ? "No-" : "",
7364                         msr & (2 << 0) ? "No-" : "", msr & (3 << 0) ? "No-" : "");
7365 }
7366 
7367 /*
7368  * Decode MSR_MISC_PWR_MGMT
7369  *
7370  * Decode the bits according to the Nehalem documentation
7371  * bit[0] seems to continue to have same meaning going forward
7372  * bit[1] less so...
7373  */
7374 void decode_misc_pwr_mgmt_msr(void)
7375 {
7376         unsigned long long msr;
7377 
7378         if (no_msr)
7379                 return;
7380 
7381         if (!platform->has_msr_misc_pwr_mgmt)
7382                 return;
7383 
7384         if (!get_msr(base_cpu, MSR_MISC_PWR_MGMT, &msr))
7385                 fprintf(outf, "cpu%d: MSR_MISC_PWR_MGMT: 0x%08llx (%sable-EIST_Coordination %sable-EPB %sable-OOB)\n",
7386                         base_cpu, msr,
7387                         msr & (1 << 0) ? "DIS" : "EN", msr & (1 << 1) ? "EN" : "DIS", msr & (1 << 8) ? "EN" : "DIS");
7388 }
7389 
7390 /*
7391  * Decode MSR_CC6_DEMOTION_POLICY_CONFIG, MSR_MC6_DEMOTION_POLICY_CONFIG
7392  *
7393  * This MSRs are present on Silvermont processors,
7394  * Intel Atom processor E3000 series (Baytrail), and friends.
7395  */
7396 void decode_c6_demotion_policy_msr(void)
7397 {
7398         unsigned long long msr;
7399 
7400         if (no_msr)
7401                 return;
7402 
7403         if (!platform->has_msr_c6_demotion_policy_config)
7404                 return;
7405 
7406         if (!get_msr(base_cpu, MSR_CC6_DEMOTION_POLICY_CONFIG, &msr))
7407                 fprintf(outf, "cpu%d: MSR_CC6_DEMOTION_POLICY_CONFIG: 0x%08llx (%sable-CC6-Demotion)\n",
7408                         base_cpu, msr, msr & (1 << 0) ? "EN" : "DIS");
7409 
7410         if (!get_msr(base_cpu, MSR_MC6_DEMOTION_POLICY_CONFIG, &msr))
7411                 fprintf(outf, "cpu%d: MSR_MC6_DEMOTION_POLICY_CONFIG: 0x%08llx (%sable-MC6-Demotion)\n",
7412                         base_cpu, msr, msr & (1 << 0) ? "EN" : "DIS");
7413 }
7414 
7415 void print_dev_latency(void)
7416 {
7417         char *path = "/dev/cpu_dma_latency";
7418         int fd;
7419         int value;
7420         int retval;
7421 
7422         fd = open(path, O_RDONLY);
7423         if (fd < 0) {
7424                 if (debug)
7425                         warnx("Read %s failed", path);
7426                 return;
7427         }
7428 
7429         retval = read(fd, (void *)&value, sizeof(int));
7430         if (retval != sizeof(int)) {
7431                 warn("read failed %s", path);
7432                 close(fd);
7433                 return;
7434         }
7435         fprintf(outf, "/dev/cpu_dma_latency: %d usec (%s)\n", value, value == 2000000000 ? "default" : "constrained");
7436 
7437         close(fd);
7438 }
7439 
7440 static int has_instr_count_access(void)
7441 {
7442         int fd;
7443         int has_access;
7444 
7445         if (no_perf)
7446                 return 0;
7447 
7448         fd = open_perf_counter(base_cpu, PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, -1, 0);
7449         has_access = fd != -1;
7450 
7451         if (fd != -1)
7452                 close(fd);
7453 
7454         if (!has_access)
7455                 warnx("Failed to access %s. Some of the counters may not be available\n"
7456                       "\tRun as root to enable them or use %s to disable the access explicitly",
7457                       "instructions retired perf counter", "--no-perf");
7458 
7459         return has_access;
7460 }
7461 
/*
 * Open one RAPL perf counter for @cpu within the group tracked by @rci.
 * On success returns the new fd and writes the sysfs-derived scale and
 * unit through @scale_ / @unit_; returns -1 on any failure.
 */
int add_rapl_perf_counter_(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai,
			   double *scale_, enum rapl_unit *unit_)
{
	if (no_perf)
		return -1;

	/* a zero scale means the sysfs scale attribute could not be read */
	const double scale = read_perf_scale(cai->perf_subsys, cai->perf_name);

	if (scale == 0.0)
		return -1;

	const enum rapl_unit unit = read_perf_rapl_unit(cai->perf_subsys, cai->perf_name);

	if (unit == RAPL_UNIT_INVALID)
		return -1;

	const unsigned int rapl_type = read_perf_type(cai->perf_subsys);
	const unsigned int rapl_energy_pkg_config = read_perf_config(cai->perf_subsys, cai->perf_name);

	/* join the existing group leader (rci->fd_perf), or -1 for none yet */
	const int fd_counter =
	    open_perf_counter(cpu, rapl_type, rapl_energy_pkg_config, rci->fd_perf, PERF_FORMAT_GROUP);
	if (fd_counter == -1)
		return -1;

	/* If it's the first counter opened, make it a group descriptor */
	if (rci->fd_perf == -1)
		rci->fd_perf = fd_counter;

	*scale_ = scale;
	*unit_ = unit;
	return fd_counter;
}
7494 
7495 int add_rapl_perf_counter(int cpu, struct rapl_counter_info_t *rci, const struct rapl_counter_arch_info *cai,
7496                           double *scale, enum rapl_unit *unit)
7497 {
7498         int ret = add_rapl_perf_counter_(cpu, rci, cai, scale, unit);
7499 
7500         if (debug >= 2)
7501                 fprintf(stderr, "%s: %d (cpu: %d)\n", __func__, ret, cpu);
7502 
7503         return ret;
7504 }
7505 
7506 /*
7507  * Linux-perf manages the HW instructions-retired counter
7508  * by enabling when requested, and hiding rollover
7509  */
7510 void linux_perf_init(void)
7511 {
7512         if (access("/proc/sys/kernel/perf_event_paranoid", F_OK))
7513                 return;
7514 
7515         if (BIC_IS_ENABLED(BIC_IPC) && has_aperf) {
7516                 fd_instr_count_percpu = calloc(topo.max_cpu_num + 1, sizeof(int));
7517                 if (fd_instr_count_percpu == NULL)
7518                         err(-1, "calloc fd_instr_count_percpu");
7519         }
7520 }
7521 
7522 void rapl_perf_init(void)
7523 {
7524         const unsigned int num_domains = (platform->has_per_core_rapl ? topo.max_core_id : topo.max_package_id) + 1;
7525         bool *domain_visited = calloc(num_domains, sizeof(bool));
7526 
7527         rapl_counter_info_perdomain = calloc(num_domains, sizeof(*rapl_counter_info_perdomain));
7528         if (rapl_counter_info_perdomain == NULL)
7529                 err(-1, "calloc rapl_counter_info_percpu");
7530         rapl_counter_info_perdomain_size = num_domains;
7531 
7532         /*
7533          * Initialize rapl_counter_info_percpu
7534          */
7535         for (unsigned int domain_id = 0; domain_id < num_domains; ++domain_id) {
7536                 struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[domain_id];
7537 
7538                 rci->fd_perf = -1;
7539                 for (size_t i = 0; i < NUM_RAPL_COUNTERS; ++i) {
7540                         rci->data[i] = 0;
7541                         rci->source[i] = COUNTER_SOURCE_NONE;
7542                 }
7543         }
7544 
7545         /*
7546          * Open/probe the counters
7547          * If can't get it via perf, fallback to MSR
7548          */
7549         for (size_t i = 0; i < ARRAY_SIZE(rapl_counter_arch_infos); ++i) {
7550 
7551                 const struct rapl_counter_arch_info *const cai = &rapl_counter_arch_infos[i];
7552                 bool has_counter = 0;
7553                 double scale;
7554                 enum rapl_unit unit;
7555                 unsigned int next_domain;
7556 
7557                 memset(domain_visited, 0, num_domains * sizeof(*domain_visited));
7558 
7559                 for (int cpu = 0; cpu < topo.max_cpu_num + 1; ++cpu) {
7560 
7561                         if (cpu_is_not_allowed(cpu))
7562                                 continue;
7563 
7564                         /* Skip already seen and handled RAPL domains */
7565                         next_domain =
7566                             platform->has_per_core_rapl ? cpus[cpu].physical_core_id : cpus[cpu].physical_package_id;
7567 
7568                         assert(next_domain < num_domains);
7569 
7570                         if (domain_visited[next_domain])
7571                                 continue;
7572 
7573                         domain_visited[next_domain] = 1;
7574 
7575                         struct rapl_counter_info_t *rci = &rapl_counter_info_perdomain[next_domain];
7576 
7577                         /* Check if the counter is enabled and accessible */
7578                         if (BIC_IS_ENABLED(cai->bic) && (platform->rapl_msrs & cai->feature_mask)) {
7579 
7580                                 /* Use perf API for this counter */
7581                                 if (!no_perf && cai->perf_name
7582                                     && add_rapl_perf_counter(cpu, rci, cai, &scale, &unit) != -1) {
7583                                         rci->source[cai->rci_index] = COUNTER_SOURCE_PERF;
7584                                         rci->scale[cai->rci_index] = scale * cai->compat_scale;
7585                                         rci->unit[cai->rci_index] = unit;
7586                                         rci->flags[cai->rci_index] = cai->flags;
7587 
7588                                         /* Use MSR for this counter */
7589                                 } else if (!no_msr && cai->msr && probe_msr(cpu, cai->msr) == 0) {
7590                                         rci->source[cai->rci_index] = COUNTER_SOURCE_MSR;
7591                                         rci->msr[cai->rci_index] = cai->msr;
7592                                         rci->msr_mask[cai->rci_index] = cai->msr_mask;
7593                                         rci->msr_shift[cai->rci_index] = cai->msr_shift;
7594                                         rci->unit[cai->rci_index] = RAPL_UNIT_JOULES;
7595                                         rci->scale[cai->rci_index] = *cai->platform_rapl_msr_scale * cai->compat_scale;
7596                                         rci->flags[cai->rci_index] = cai->flags;
7597                                 }
7598                         }
7599 
7600                         if (rci->source[cai->rci_index] != COUNTER_SOURCE_NONE)
7601                                 has_counter = 1;
7602                 }
7603 
7604                 /* If any CPU has access to the counter, make it present */
7605                 if (has_counter)
7606                         BIC_PRESENT(cai->bic);
7607         }
7608 
7609         free(domain_visited);
7610 }
7611 
7612 /* Assumes msr_counter_info is populated */
7613 static int has_amperf_access(void)
7614 {
7615         return msr_counter_arch_infos[MSR_ARCH_INFO_APERF_INDEX].present &&
7616             msr_counter_arch_infos[MSR_ARCH_INFO_MPERF_INDEX].present;
7617 }
7618 
7619 int *get_cstate_perf_group_fd(struct cstate_counter_info_t *cci, const char *group_name)
7620 {
7621         if (strcmp(group_name, "cstate_core") == 0)
7622                 return &cci->fd_perf_core;
7623 
7624         if (strcmp(group_name, "cstate_pkg") == 0)
7625                 return &cci->fd_perf_pkg;
7626 
7627         return NULL;
7628 }
7629 
7630 int add_cstate_perf_counter_(int cpu, struct cstate_counter_info_t *cci, const struct cstate_counter_arch_info *cai)
7631 {
7632         if (no_perf)
7633                 return -1;
7634 
7635         int *pfd_group = get_cstate_perf_group_fd(cci, cai->perf_subsys);
7636 
7637         if (pfd_group == NULL)
7638                 return -1;
7639 
7640         const unsigned int type = read_perf_type(cai->perf_subsys);
7641         const unsigned int config = read_perf_config(cai->perf_subsys, cai->perf_name);
7642 
7643         const int fd_counter = open_perf_counter(cpu, type, config, *pfd_group, PERF_FORMAT_GROUP);
7644 
7645         if (fd_counter == -1)
7646                 return -1;
7647 
7648         /* If it's the first counter opened, make it a group descriptor */
7649         if (*pfd_group == -1)
7650                 *pfd_group = fd_counter;
7651 
7652         return fd_counter;
7653 }
7654 
7655 int add_cstate_perf_counter(int cpu, struct cstate_counter_info_t *cci, const struct cstate_counter_arch_info *cai)
7656 {
7657         int ret = add_cstate_perf_counter_(cpu, cci, cai);
7658 
7659         if (debug >= 2)
7660                 fprintf(stderr, "%s: %d (cpu: %d)\n", __func__, ret, cpu);
7661 
7662         return ret;
7663 }
7664 
7665 int add_msr_perf_counter_(int cpu, struct msr_counter_info_t *cci, const struct msr_counter_arch_info *cai)
7666 {
7667         if (no_perf)
7668                 return -1;
7669 
7670         const unsigned int type = read_perf_type(cai->perf_subsys);
7671         const unsigned int config = read_perf_config(cai->perf_subsys, cai->perf_name);
7672 
7673         const int fd_counter = open_perf_counter(cpu, type, config, cci->fd_perf, PERF_FORMAT_GROUP);
7674 
7675         if (fd_counter == -1)
7676                 return -1;
7677 
7678         /* If it's the first counter opened, make it a group descriptor */
7679         if (cci->fd_perf == -1)
7680                 cci->fd_perf = fd_counter;
7681 
7682         return fd_counter;
7683 }
7684 
7685 int add_msr_perf_counter(int cpu, struct msr_counter_info_t *cci, const struct msr_counter_arch_info *cai)
7686 {
7687         int ret = add_msr_perf_counter_(cpu, cci, cai);
7688 
7689         if (debug)
7690                 fprintf(stderr, "%s: %s/%s: %d (cpu: %d)\n", __func__, cai->perf_subsys, cai->perf_name, ret, cpu);
7691 
7692         return ret;
7693 }
7694 
/*
 * Allocate per-CPU msr_counter_info and probe each needed architectural
 * MSR counter, preferring perf and falling back to direct MSR access.
 */
void msr_perf_init_(void)
{
	const int mci_num = topo.max_cpu_num + 1;

	msr_counter_info = calloc(mci_num, sizeof(*msr_counter_info));
	if (!msr_counter_info)
		err(1, "calloc msr_counter_info");
	msr_counter_info_size = mci_num;

	/* -1 marks "no perf group leader opened yet" for each CPU */
	for (int cpu = 0; cpu < mci_num; ++cpu)
		msr_counter_info[cpu].fd_perf = -1;

	for (int cidx = 0; cidx < NUM_MSR_COUNTERS; ++cidx) {

		struct msr_counter_arch_info *cai = &msr_counter_arch_infos[cidx];

		cai->present = false;

		for (int cpu = 0; cpu < mci_num; ++cpu) {

			struct msr_counter_info_t *const cci = &msr_counter_info[cpu];

			if (cpu_is_not_allowed(cpu))
				continue;

			if (cai->needed) {
				/* Use perf API for this counter */
				if (!no_perf && cai->perf_name && add_msr_perf_counter(cpu, cci, cai) != -1) {
					cci->source[cai->rci_index] = COUNTER_SOURCE_PERF;
					cai->present = true;

					/* Use MSR for this counter */
				} else if (!no_msr && cai->msr && probe_msr(cpu, cai->msr) == 0) {
					cci->source[cai->rci_index] = COUNTER_SOURCE_MSR;
					cci->msr[cai->rci_index] = cai->msr;
					cci->msr_mask[cai->rci_index] = cai->msr_mask;
					cai->present = true;
				}
			}
		}
	}
}
7737 
/* Initialize data for reading perf counters from the MSR group. */
void msr_perf_init(void)
{
	bool need_amperf = false, need_smi = false;
	/* without a HW CC1 residency MSR, CC1 is derived from APERF/MPERF */
	const bool need_soft_c1 = (!platform->has_msr_core_c1_res) && (platform->supported_cstates & CC1);

	need_amperf = BIC_IS_ENABLED(BIC_Avg_MHz) || BIC_IS_ENABLED(BIC_Busy) || BIC_IS_ENABLED(BIC_Bzy_MHz)
	    || BIC_IS_ENABLED(BIC_IPC) || need_soft_c1;

	if (BIC_IS_ENABLED(BIC_SMI))
		need_smi = true;

	/* Enable needed counters */
	msr_counter_arch_infos[MSR_ARCH_INFO_APERF_INDEX].needed = need_amperf;
	msr_counter_arch_infos[MSR_ARCH_INFO_MPERF_INDEX].needed = need_amperf;
	msr_counter_arch_infos[MSR_ARCH_INFO_SMI_INDEX].needed = need_smi;

	msr_perf_init_();

	const bool has_amperf = has_amperf_access();
	const bool has_smi = msr_counter_arch_infos[MSR_ARCH_INFO_SMI_INDEX].present;

	has_aperf_access = has_amperf;

	if (has_amperf) {
		BIC_PRESENT(BIC_Avg_MHz);
		BIC_PRESENT(BIC_Busy);
		BIC_PRESENT(BIC_Bzy_MHz);
		/* NOTE(review): BIC_SMI is also marked present here, under the
		 * amperf check rather than has_smi — looks intentional but
		 * confirm against the SMI counter's probing path below */
		BIC_PRESENT(BIC_SMI);
	}

	if (has_smi)
		BIC_PRESENT(BIC_SMI);
}
7772 
/*
 * cstate_perf_init_() - probe access to the C-state residency counters.
 *
 * For each architectural C-state counter, walk all allowed CPUs and choose a
 * backing source: the perf API when available, otherwise a direct MSR read.
 * The counter's collection scope (per-thread / per-core / per-package) is
 * honored so each core or package is probed only once.  A counter's BIC is
 * marked present if at least one CPU can read it.
 *
 * @soft_c1: when true, counters flagged as dependencies of the "software"
 *           C1 residency computation are treated as needed even when their
 *           BIC was not explicitly enabled.
 */
void cstate_perf_init_(bool soft_c1)
{
	bool has_counter;
	bool *cores_visited = NULL, *pkg_visited = NULL;
	const int cores_visited_elems = topo.max_core_id + 1;
	const int pkg_visited_elems = topo.max_package_id + 1;
	const int cci_num = topo.max_cpu_num + 1;

	/* Per-cpu counter-source bookkeeping, one slot per possible cpu. */
	ccstate_counter_info = calloc(cci_num, sizeof(*ccstate_counter_info));
	if (!ccstate_counter_info)
		err(1, "calloc ccstate_counter_arch_info");
	ccstate_counter_info_size = cci_num;

	cores_visited = calloc(cores_visited_elems, sizeof(*cores_visited));
	if (!cores_visited)
		err(1, "calloc cores_visited");

	pkg_visited = calloc(pkg_visited_elems, sizeof(*pkg_visited));
	if (!pkg_visited)
		err(1, "calloc pkg_visited");

	/* Initialize cstate_counter_info_percpu */
	for (int cpu = 0; cpu < cci_num; ++cpu) {
		ccstate_counter_info[cpu].fd_perf_core = -1;
		ccstate_counter_info[cpu].fd_perf_pkg = -1;
	}

	for (int cidx = 0; cidx < NUM_CSTATE_COUNTERS; ++cidx) {
		has_counter = false;
		/* Each core/package is probed at most once per counter. */
		memset(cores_visited, 0, cores_visited_elems * sizeof(*cores_visited));
		memset(pkg_visited, 0, pkg_visited_elems * sizeof(*pkg_visited));

		const struct cstate_counter_arch_info *cai = &ccstate_counter_arch_infos[cidx];

		for (int cpu = 0; cpu < cci_num; ++cpu) {

			struct cstate_counter_info_t *const cci = &ccstate_counter_info[cpu];

			if (cpu_is_not_allowed(cpu))
				continue;

			const int core_id = cpus[cpu].physical_core_id;
			const int pkg_id = cpus[cpu].physical_package_id;

			assert(core_id < cores_visited_elems);
			assert(pkg_id < pkg_visited_elems);

			const bool per_thread = cai->flags & CSTATE_COUNTER_FLAG_COLLECT_PER_THREAD;
			const bool per_core = cai->flags & CSTATE_COUNTER_FLAG_COLLECT_PER_CORE;

			/*
			 * Skip cpus whose core/package was already handled,
			 * unless the counter is collected at a finer scope.
			 */
			if (!per_thread && cores_visited[core_id])
				continue;

			if (!per_core && pkg_visited[pkg_id])
				continue;

			const bool counter_needed = BIC_IS_ENABLED(cai->bic) ||
			    (soft_c1 && (cai->flags & CSTATE_COUNTER_FLAG_SOFT_C1_DEPENDENCY));
			const bool counter_supported = (platform->supported_cstates & cai->feature_mask);

			if (counter_needed && counter_supported) {
				/* Use perf API for this counter */
				if (!no_perf && cai->perf_name && add_cstate_perf_counter(cpu, cci, cai) != -1) {

					cci->source[cai->rci_index] = COUNTER_SOURCE_PERF;

					/* User MSR for this counter */
				} else if (!no_msr && cai->msr && pkg_cstate_limit >= cai->pkg_cstate_limit
					   && probe_msr(cpu, cai->msr) == 0) {
					cci->source[cai->rci_index] = COUNTER_SOURCE_MSR;
					cci->msr[cai->rci_index] = cai->msr;
				}
			}

			if (cci->source[cai->rci_index] != COUNTER_SOURCE_NONE) {
				has_counter = true;
				cores_visited[core_id] = true;
				pkg_visited[pkg_id] = true;
			}
		}

		/* If any CPU has access to the counter, make it present */
		if (has_counter)
			BIC_PRESENT(cai->bic);
	}

	free(cores_visited);
	free(pkg_visited);
}
7862 
7863 void cstate_perf_init(void)
7864 {
7865         /*
7866          * If we don't have a C1 residency MSR, we calculate it "in software",
7867          * but we need APERF, MPERF too.
7868          */
7869         const bool soft_c1 = !platform->has_msr_core_c1_res && has_amperf_access()
7870             && platform->supported_cstates & CC1;
7871 
7872         if (soft_c1)
7873                 BIC_PRESENT(BIC_CPU_c1);
7874 
7875         cstate_perf_init_(soft_c1);
7876 }
7877 
7878 void probe_cstates(void)
7879 {
7880         probe_cst_limit();
7881 
7882         if (platform->has_msr_module_c6_res_ms)
7883                 BIC_PRESENT(BIC_Mod_c6);
7884 
7885         if (platform->has_ext_cst_msrs && !no_msr) {
7886                 BIC_PRESENT(BIC_Totl_c0);
7887                 BIC_PRESENT(BIC_Any_c0);
7888                 BIC_PRESENT(BIC_GFX_c0);
7889                 BIC_PRESENT(BIC_CPUGFX);
7890         }
7891 
7892         if (quiet)
7893                 return;
7894 
7895         dump_power_ctl();
7896         dump_cst_cfg();
7897         decode_c6_demotion_policy_msr();
7898         print_dev_latency();
7899         dump_sysfs_cstate_config();
7900         print_irtl();
7901 }
7902 
7903 void probe_lpi(void)
7904 {
7905         if (!access("/sys/devices/system/cpu/cpuidle/low_power_idle_cpu_residency_us", R_OK))
7906                 BIC_PRESENT(BIC_CPU_LPI);
7907         else
7908                 BIC_NOT_PRESENT(BIC_CPU_LPI);
7909 
7910         if (!access(sys_lpi_file_sysfs, R_OK)) {
7911                 sys_lpi_file = sys_lpi_file_sysfs;
7912                 BIC_PRESENT(BIC_SYS_LPI);
7913         } else if (!access(sys_lpi_file_debugfs, R_OK)) {
7914                 sys_lpi_file = sys_lpi_file_debugfs;
7915                 BIC_PRESENT(BIC_SYS_LPI);
7916         } else {
7917                 sys_lpi_file_sysfs = NULL;
7918                 BIC_NOT_PRESENT(BIC_SYS_LPI);
7919         }
7920 
7921 }
7922 
7923 void probe_pstates(void)
7924 {
7925         probe_bclk();
7926 
7927         if (quiet)
7928                 return;
7929 
7930         dump_platform_info();
7931         dump_turbo_ratio_info();
7932         dump_sysfs_pstate_config();
7933         decode_misc_pwr_mgmt_msr();
7934 
7935         for_all_cpus(print_hwp, ODD_COUNTERS);
7936         for_all_cpus(print_epb, ODD_COUNTERS);
7937         for_all_cpus(print_perf_limit, ODD_COUNTERS);
7938 }
7939 
/*
 * process_cpuid() - enumerate CPU capabilities via the CPUID instruction.
 *
 * Detects vendor, family/model/stepping, microcode revision (via MSR),
 * invariant TSC, APERF/MPERF, DTS/PTM thermal sensors, HWP/EPB features,
 * SGX/hybrid, the TSC/crystal ratio (leaf 0x15) and base/max/bus
 * frequencies (leaf 0x16).  Results are stored in the corresponding
 * globals and matching built-in counters are marked present.
 * Exits fatally if the CPU does not advertise MSR support.
 */
void process_cpuid()
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int fms, family, model, stepping, ecx_flags, edx_flags;
	unsigned long long ucode_patch = 0;
	bool ucode_patch_valid = false;

	eax = ebx = ecx = edx = 0;

	/* Leaf 0: max supported leaf in EAX, vendor string in EBX:EDX:ECX. */
	__cpuid(0, max_level, ebx, ecx, edx);

	if (ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69)
		genuine_intel = 1;	/* "GenuineIntel" */
	else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
		authentic_amd = 1;	/* "AuthenticAMD" */
	else if (ebx == 0x6f677948 && ecx == 0x656e6975 && edx == 0x6e65476e)
		hygon_genuine = 1;	/* "HygonGenuine" */

	if (!quiet)
		fprintf(outf, "CPUID(0): %.4s%.4s%.4s 0x%x CPUID levels\n",
			(char *)&ebx, (char *)&edx, (char *)&ecx, max_level);

	/* Leaf 1: family/model/stepping in EAX, feature flags in ECX/EDX. */
	__cpuid(1, fms, ebx, ecx, edx);
	family = (fms >> 8) & 0xf;
	model = (fms >> 4) & 0xf;
	stepping = fms & 0xf;
	/* Fold in the extended family/model fields per the CPUID convention. */
	if (family == 0xf)
		family += (fms >> 20) & 0xff;
	if (family >= 6)
		model += ((fms >> 16) & 0xf) << 4;
	ecx_flags = ecx;
	edx_flags = edx;

	if (!no_msr) {
		/* Microcode revision is in the upper 32 bits of this MSR. */
		if (get_msr(sched_getcpu(), MSR_IA32_UCODE_REV, &ucode_patch))
			warnx("get_msr(UCODE)");
		else
			ucode_patch_valid = true;
	}

	/*
	 * check max extended function levels of CPUID.
	 * This is needed to check for invariant TSC.
	 * This check is valid for both Intel and AMD.
	 */
	ebx = ecx = edx = 0;
	__cpuid(0x80000000, max_extended_level, ebx, ecx, edx);

	if (!quiet) {
		fprintf(outf, "CPUID(1): family:model:stepping 0x%x:%x:%x (%d:%d:%d)",
			family, model, stepping, family, model, stepping);
		if (ucode_patch_valid)
			fprintf(outf, " microcode 0x%x", (unsigned int)((ucode_patch >> 32) & 0xFFFFFFFF));
		fputc('\n', outf);

		fprintf(outf, "CPUID(0x80000000): max_extended_levels: 0x%x\n", max_extended_level);
		fprintf(outf, "CPUID(1): %s %s %s %s %s %s %s %s %s %s\n",
			ecx_flags & (1 << 0) ? "SSE3" : "-",
			ecx_flags & (1 << 3) ? "MONITOR" : "-",
			ecx_flags & (1 << 6) ? "SMX" : "-",
			ecx_flags & (1 << 7) ? "EIST" : "-",
			ecx_flags & (1 << 8) ? "TM2" : "-",
			edx_flags & (1 << 4) ? "TSC" : "-",
			edx_flags & (1 << 5) ? "MSR" : "-",
			edx_flags & (1 << 22) ? "ACPI-TM" : "-",
			edx_flags & (1 << 28) ? "HT" : "-", edx_flags & (1 << 29) ? "TM" : "-");
	}

	probe_platform_features(family, model);

	/* turbostat is useless without MSR access (CPUID.1:EDX bit 5). */
	if (!(edx_flags & (1 << 5)))
		errx(1, "CPUID: no MSR");

	if (max_extended_level >= 0x80000007) {

		/*
		 * Non-Stop TSC is advertised by CPUID.EAX=0x80000007: EDX.bit8
		 * this check is valid for both Intel and AMD
		 */
		__cpuid(0x80000007, eax, ebx, ecx, edx);
		has_invariant_tsc = edx & (1 << 8);
	}

	/*
	 * APERF/MPERF is advertised by CPUID.EAX=0x6: ECX.bit0
	 * this check is valid for both Intel and AMD
	 */

	__cpuid(0x6, eax, ebx, ecx, edx);
	has_aperf = ecx & (1 << 0);
	do_dts = eax & (1 << 0);	/* Digital Thermal Sensor */
	if (do_dts)
		BIC_PRESENT(BIC_CoreTmp);
	has_turbo = eax & (1 << 1);
	do_ptm = eax & (1 << 6);	/* Package Thermal Management */
	if (do_ptm)
		BIC_PRESENT(BIC_PkgTmp);
	has_hwp = eax & (1 << 7);
	has_hwp_notify = eax & (1 << 8);
	has_hwp_activity_window = eax & (1 << 9);
	has_hwp_epp = eax & (1 << 10);
	has_hwp_pkg = eax & (1 << 11);
	has_epb = ecx & (1 << 3);

	if (!quiet)
		fprintf(outf, "CPUID(6): %sAPERF, %sTURBO, %sDTS, %sPTM, %sHWP, "
			"%sHWPnotify, %sHWPwindow, %sHWPepp, %sHWPpkg, %sEPB\n",
			has_aperf ? "" : "No-",
			has_turbo ? "" : "No-",
			do_dts ? "" : "No-",
			do_ptm ? "" : "No-",
			has_hwp ? "" : "No-",
			has_hwp_notify ? "" : "No-",
			has_hwp_activity_window ? "" : "No-",
			has_hwp_epp ? "" : "No-", has_hwp_pkg ? "" : "No-", has_epb ? "" : "No-");

	if (!quiet)
		decode_misc_enable_msr();

	/* Leaf 7 (verbose only): SGX and hybrid-core detection. */
	if (max_level >= 0x7 && !quiet) {
		int has_sgx;

		ecx = 0;

		__cpuid_count(0x7, 0, eax, ebx, ecx, edx);

		has_sgx = ebx & (1 << 2);

		is_hybrid = edx & (1 << 15);

		fprintf(outf, "CPUID(7): %sSGX %sHybrid\n", has_sgx ? "" : "No-", is_hybrid ? "" : "No-");

		if (has_sgx)
			decode_feature_control_msr();
	}

	if (max_level >= 0x15) {
		unsigned int eax_crystal;
		unsigned int ebx_tsc;

		/*
		 * CPUID 15H TSC/Crystal ratio, possibly Crystal Hz
		 */
		eax_crystal = ebx_tsc = crystal_hz = edx = 0;
		__cpuid(0x15, eax_crystal, ebx_tsc, crystal_hz, edx);

		if (ebx_tsc != 0) {
			/*
			 * NOTE(review): 'ebx' here still holds the value from
			 * the CPUID(0x80000000) call above; presumably
			 * 'ebx_tsc' was intended -- confirm before changing.
			 */
			if (!quiet && (ebx != 0))
				fprintf(outf, "CPUID(0x15): eax_crystal: %d ebx_tsc: %d ecx_crystal_hz: %d\n",
					eax_crystal, ebx_tsc, crystal_hz);

			/* Fall back to a platform-table crystal frequency. */
			if (crystal_hz == 0)
				crystal_hz = platform->crystal_freq;

			if (crystal_hz) {
				tsc_hz = (unsigned long long)crystal_hz *ebx_tsc / eax_crystal;
				if (!quiet)
					fprintf(outf, "TSC: %lld MHz (%d Hz * %d / %d / 1000000)\n",
						tsc_hz / 1000000, crystal_hz, ebx_tsc, eax_crystal);
			}
		}
	}
	if (max_level >= 0x16) {
		/* NOTE: this 'edx' intentionally shadows the outer one. */
		unsigned int base_mhz, max_mhz, bus_mhz, edx;

		/*
		 * CPUID 16H Base MHz, Max MHz, Bus MHz
		 */
		base_mhz = max_mhz = bus_mhz = edx = 0;

		__cpuid(0x16, base_mhz, max_mhz, bus_mhz, edx);

		bclk = bus_mhz;

		base_hz = base_mhz * 1000000;
		has_base_hz = 1;

		if (platform->enable_tsc_tweak)
			tsc_tweak = base_hz / tsc_hz;

		if (!quiet)
			fprintf(outf, "CPUID(0x16): base_mhz: %d max_mhz: %d bus_mhz: %d\n",
				base_mhz, max_mhz, bus_mhz);
	}

	/* Some platforms report APERF/MPERF scaled by 1024. */
	if (has_aperf)
		aperf_mperf_multiplier = platform->need_perf_multiplier ? 1024 : 1;

	BIC_PRESENT(BIC_IRQ);
	BIC_PRESENT(BIC_TSC_MHz);
}
8131 
8132 static void counter_info_init(void)
8133 {
8134         for (int i = 0; i < NUM_CSTATE_COUNTERS; ++i) {
8135                 struct cstate_counter_arch_info *const cai = &ccstate_counter_arch_infos[i];
8136 
8137                 if (platform->has_msr_knl_core_c6_residency && cai->msr == MSR_CORE_C6_RESIDENCY)
8138                         cai->msr = MSR_KNL_CORE_C6_RESIDENCY;
8139 
8140                 if (!platform->has_msr_core_c1_res && cai->msr == MSR_CORE_C1_RES)
8141                         cai->msr = 0;
8142 
8143                 if (platform->has_msr_atom_pkg_c6_residency && cai->msr == MSR_PKG_C6_RESIDENCY)
8144                         cai->msr = MSR_ATOM_PKG_C6_RESIDENCY;
8145         }
8146 
8147         for (int i = 0; i < NUM_MSR_COUNTERS; ++i) {
8148                 msr_counter_arch_infos[i].present = false;
8149                 msr_counter_arch_infos[i].needed = false;
8150         }
8151 }
8152 
8153 void probe_pm_features(void)
8154 {
8155         probe_pstates();
8156 
8157         probe_cstates();
8158 
8159         probe_lpi();
8160 
8161         probe_intel_uncore_frequency();
8162 
8163         probe_graphics();
8164 
8165         probe_rapl();
8166 
8167         probe_thermal();
8168 
8169         if (platform->has_nhm_msrs && !no_msr)
8170                 BIC_PRESENT(BIC_SMI);
8171 
8172         if (!quiet)
8173                 decode_misc_feature_control();
8174 }
8175 
8176 /*
8177  * in /dev/cpu/ return success for names that are numbers
8178  * ie. filter out ".", "..", "microcode".
8179  */
8180 int dir_filter(const struct dirent *dirp)
8181 {
8182         if (isdigit(dirp->d_name[0]))
8183                 return 1;
8184         else
8185                 return 0;
8186 }
8187 
/*
 * topology_probe() - discover cpu topology and build the allowed-cpu sets.
 *
 * Counts present cpus, allocates the per-cpu topology array and the
 * present/effective/allowed/affinity cpu sets, then walks every present cpu
 * recording its package, die, node, core and thread ids.  Derives the
 * topo.* summary fields (counts and maximums) and marks the corresponding
 * topology columns present.
 *
 * @startup: true during initial startup; a --cpu cpu that is not present is
 *           then fatal instead of just a warning.
 */
void topology_probe(bool startup)
{
	int i;
	int max_core_id = 0;
	int max_package_id = 0;
	int max_siblings = 0;

	/* Initialize num_cpus, max_cpu_num */
	set_max_cpu_num();
	topo.num_cpus = 0;
	for_all_proc_cpus(count_cpus);
	if (!summary_only && topo.num_cpus > 1)
		BIC_PRESENT(BIC_CPU);

	if (debug > 1)
		fprintf(outf, "num_cpus %d max_cpu_num %d\n", topo.num_cpus, topo.max_cpu_num);

	/* Per-cpu topology records, indexed by cpu number. */
	cpus = calloc(1, (topo.max_cpu_num + 1) * sizeof(struct cpu_topology));
	if (cpus == NULL)
		err(1, "calloc cpus");

	/*
	 * Allocate and initialize cpu_present_set
	 */
	cpu_present_set = CPU_ALLOC((topo.max_cpu_num + 1));
	if (cpu_present_set == NULL)
		err(3, "CPU_ALLOC");
	cpu_present_setsize = CPU_ALLOC_SIZE((topo.max_cpu_num + 1));
	CPU_ZERO_S(cpu_present_setsize, cpu_present_set);
	for_all_proc_cpus(mark_cpu_present);

	/*
	 * Allocate and initialize cpu_effective_set
	 */
	cpu_effective_set = CPU_ALLOC((topo.max_cpu_num + 1));
	if (cpu_effective_set == NULL)
		err(3, "CPU_ALLOC");
	cpu_effective_setsize = CPU_ALLOC_SIZE((topo.max_cpu_num + 1));
	CPU_ZERO_S(cpu_effective_setsize, cpu_effective_set);
	update_effective_set(startup);

	/*
	 * Allocate and initialize cpu_allowed_set
	 */
	cpu_allowed_set = CPU_ALLOC((topo.max_cpu_num + 1));
	if (cpu_allowed_set == NULL)
		err(3, "CPU_ALLOC");
	cpu_allowed_setsize = CPU_ALLOC_SIZE((topo.max_cpu_num + 1));
	CPU_ZERO_S(cpu_allowed_setsize, cpu_allowed_set);

	/*
	 * Validate and update cpu_allowed_set.
	 *
	 * Make sure all cpus in cpu_subset are also in cpu_present_set during startup.
	 * Give a warning when cpus in cpu_subset become unavailable at runtime.
	 * Give a warning when cpus are not effective because of cgroup setting.
	 *
	 * cpu_allowed_set is the intersection of cpu_present_set/cpu_effective_set/cpu_subset.
	 */
	for (i = 0; i < CPU_SUBSET_MAXCPUS; ++i) {
		if (cpu_subset && !CPU_ISSET_S(i, cpu_subset_size, cpu_subset))
			continue;

		if (!CPU_ISSET_S(i, cpu_present_setsize, cpu_present_set)) {
			if (cpu_subset) {
				/* cpus in cpu_subset must be in cpu_present_set during startup */
				if (startup)
					err(1, "cpu%d not present", i);
				else
					fprintf(stderr, "cpu%d not present\n", i);
			}
			continue;
		}

		/* An empty effective set means no cgroup restriction applies. */
		if (CPU_COUNT_S(cpu_effective_setsize, cpu_effective_set)) {
			if (!CPU_ISSET_S(i, cpu_effective_setsize, cpu_effective_set)) {
				fprintf(stderr, "cpu%d not effective\n", i);
				continue;
			}
		}

		CPU_SET_S(i, cpu_allowed_setsize, cpu_allowed_set);
	}

	if (!CPU_COUNT_S(cpu_allowed_setsize, cpu_allowed_set))
		err(-ENODEV, "No valid cpus found");
	sched_setaffinity(0, cpu_allowed_setsize, cpu_allowed_set);

	/*
	 * Allocate and initialize cpu_affinity_set
	 */
	cpu_affinity_set = CPU_ALLOC((topo.max_cpu_num + 1));
	if (cpu_affinity_set == NULL)
		err(3, "CPU_ALLOC");
	cpu_affinity_setsize = CPU_ALLOC_SIZE((topo.max_cpu_num + 1));
	CPU_ZERO_S(cpu_affinity_setsize, cpu_affinity_set);

	for_all_proc_cpus(init_thread_id);

	/*
	 * For online cpus
	 * find max_core_id, max_package_id
	 */
	for (i = 0; i <= topo.max_cpu_num; ++i) {
		int siblings;

		if (cpu_is_not_present(i)) {
			if (debug > 1)
				fprintf(outf, "cpu%d NOT PRESENT\n", i);
			continue;
		}

		cpus[i].logical_cpu_id = i;

		/* get package information */
		cpus[i].physical_package_id = get_physical_package_id(i);
		if (cpus[i].physical_package_id > max_package_id)
			max_package_id = cpus[i].physical_package_id;

		/* get die information */
		cpus[i].die_id = get_die_id(i);
		if (cpus[i].die_id > topo.max_die_id)
			topo.max_die_id = cpus[i].die_id;

		/* get numa node information */
		cpus[i].physical_node_id = get_physical_node_id(&cpus[i]);
		if (cpus[i].physical_node_id > topo.max_node_num)
			topo.max_node_num = cpus[i].physical_node_id;

		/* get core information */
		cpus[i].physical_core_id = get_core_id(i);
		if (cpus[i].physical_core_id > max_core_id)
			max_core_id = cpus[i].physical_core_id;

		/* get thread information */
		siblings = get_thread_siblings(&cpus[i]);
		if (siblings > max_siblings)
			max_siblings = siblings;
		/* count each core once, via its first thread */
		if (cpus[i].thread_id == 0)
			topo.num_cores++;
	}
	topo.max_core_id = max_core_id;
	topo.max_package_id = max_package_id;

	topo.cores_per_node = max_core_id + 1;
	if (debug > 1)
		fprintf(outf, "max_core_id %d, sizing for %d cores per package\n", max_core_id, topo.cores_per_node);
	if (!summary_only && topo.cores_per_node > 1)
		BIC_PRESENT(BIC_Core);

	topo.num_die = topo.max_die_id + 1;
	if (debug > 1)
		fprintf(outf, "max_die_id %d, sizing for %d die\n", topo.max_die_id, topo.num_die);
	if (!summary_only && topo.num_die > 1)
		BIC_PRESENT(BIC_Die);

	topo.num_packages = max_package_id + 1;
	if (debug > 1)
		fprintf(outf, "max_package_id %d, sizing for %d packages\n", max_package_id, topo.num_packages);
	if (!summary_only && topo.num_packages > 1)
		BIC_PRESENT(BIC_Package);

	set_node_data();
	if (debug > 1)
		fprintf(outf, "nodes_per_pkg %d\n", topo.nodes_per_pkg);
	if (!summary_only && topo.nodes_per_pkg > 1)
		BIC_PRESENT(BIC_Node);

	topo.threads_per_core = max_siblings;
	if (debug > 1)
		fprintf(outf, "max_siblings %d\n", max_siblings);

	if (debug < 1)
		return;

	/* Verbose dump of the per-cpu topology just recorded. */
	for (i = 0; i <= topo.max_cpu_num; ++i) {
		if (cpu_is_not_present(i))
			continue;
		fprintf(outf,
			"cpu %d pkg %d die %d node %d lnode %d core %d thread %d\n",
			i, cpus[i].physical_package_id, cpus[i].die_id,
			cpus[i].physical_node_id, cpus[i].logical_node_id, cpus[i].physical_core_id, cpus[i].thread_id);
	}

}
8373 
8374 void allocate_counters(struct thread_data **t, struct core_data **c, struct pkg_data **p)
8375 {
8376         int i;
8377         int num_cores = topo.cores_per_node * topo.nodes_per_pkg * topo.num_packages;
8378         int num_threads = topo.threads_per_core * num_cores;
8379 
8380         *t = calloc(num_threads, sizeof(struct thread_data));
8381         if (*t == NULL)
8382                 goto error;
8383 
8384         for (i = 0; i < num_threads; i++)
8385                 (*t)[i].cpu_id = -1;
8386 
8387         *c = calloc(num_cores, sizeof(struct core_data));
8388         if (*c == NULL)
8389                 goto error;
8390 
8391         for (i = 0; i < num_cores; i++) {
8392                 (*c)[i].core_id = -1;
8393                 (*c)[i].base_cpu = -1;
8394         }
8395 
8396         *p = calloc(topo.num_packages, sizeof(struct pkg_data));
8397         if (*p == NULL)
8398                 goto error;
8399 
8400         for (i = 0; i < topo.num_packages; i++) {
8401                 (*p)[i].package_id = i;
8402                 (*p)[i].base_cpu = -1;
8403         }
8404 
8405         return;
8406 error:
8407         err(1, "calloc counters");
8408 }
8409 
8410 /*
8411  * init_counter()
8412  *
8413  * set FIRST_THREAD_IN_CORE and FIRST_CORE_IN_PACKAGE
8414  */
8415 void init_counter(struct thread_data *thread_base, struct core_data *core_base, struct pkg_data *pkg_base, int cpu_id)
8416 {
8417         int pkg_id = cpus[cpu_id].physical_package_id;
8418         int node_id = cpus[cpu_id].logical_node_id;
8419         int core_id = cpus[cpu_id].physical_core_id;
8420         int thread_id = cpus[cpu_id].thread_id;
8421         struct thread_data *t;
8422         struct core_data *c;
8423         struct pkg_data *p;
8424 
8425         /* Workaround for systems where physical_node_id==-1
8426          * and logical_node_id==(-1 - topo.num_cpus)
8427          */
8428         if (node_id < 0)
8429                 node_id = 0;
8430 
8431         t = GET_THREAD(thread_base, thread_id, core_id, node_id, pkg_id);
8432         c = GET_CORE(core_base, core_id, node_id, pkg_id);
8433         p = GET_PKG(pkg_base, pkg_id);
8434 
8435         t->cpu_id = cpu_id;
8436         if (!cpu_is_not_allowed(cpu_id)) {
8437                 if (c->base_cpu < 0)
8438                         c->base_cpu = t->cpu_id;
8439                 if (p->base_cpu < 0)
8440                         p->base_cpu = t->cpu_id;
8441         }
8442 
8443         c->core_id = core_id;
8444         p->package_id = pkg_id;
8445 }
8446 
8447 int initialize_counters(int cpu_id)
8448 {
8449         init_counter(EVEN_COUNTERS, cpu_id);
8450         init_counter(ODD_COUNTERS, cpu_id);
8451         return 0;
8452 }
8453 
8454 void allocate_output_buffer()
8455 {
8456         output_buffer = calloc(1, (1 + topo.num_cpus) * 2048);
8457         outp = output_buffer;
8458         if (outp == NULL)
8459                 err(-1, "calloc output buffer");
8460 }
8461 
8462 void allocate_fd_percpu(void)
8463 {
8464         fd_percpu = calloc(topo.max_cpu_num + 1, sizeof(int));
8465         if (fd_percpu == NULL)
8466                 err(-1, "calloc fd_percpu");
8467 }
8468 
8469 void allocate_irq_buffers(void)
8470 {
8471         irq_column_2_cpu = calloc(topo.num_cpus, sizeof(int));
8472         if (irq_column_2_cpu == NULL)
8473                 err(-1, "calloc %d", topo.num_cpus);
8474 
8475         irqs_per_cpu = calloc(topo.max_cpu_num + 1, sizeof(int));
8476         if (irqs_per_cpu == NULL)
8477                 err(-1, "calloc %d", topo.max_cpu_num + 1);
8478 }
8479 
8480 int update_topo(struct thread_data *t, struct core_data *c, struct pkg_data *p)
8481 {
8482         topo.allowed_cpus++;
8483         if ((int)t->cpu_id == c->base_cpu)
8484                 topo.allowed_cores++;
8485         if ((int)t->cpu_id == p->base_cpu)
8486                 topo.allowed_packages++;
8487 
8488         return 0;
8489 }
8490 
8491 void topology_update(void)
8492 {
8493         topo.allowed_cpus = 0;
8494         topo.allowed_cores = 0;
8495         topo.allowed_packages = 0;
8496         for_all_cpus(update_topo, ODD_COUNTERS);
8497 }
8498 
8499 void setup_all_buffers(bool startup)
8500 {
8501         topology_probe(startup);
8502         allocate_irq_buffers();
8503         allocate_fd_percpu();
8504         allocate_counters(&thread_even, &core_even, &package_even);
8505         allocate_counters(&thread_odd, &core_odd, &package_odd);
8506         allocate_output_buffer();
8507         for_all_proc_cpus(initialize_counters);
8508         topology_update();
8509 }
8510 
8511 void set_base_cpu(void)
8512 {
8513         int i;
8514 
8515         for (i = 0; i < topo.max_cpu_num + 1; ++i) {
8516                 if (cpu_is_not_allowed(i))
8517                         continue;
8518                 base_cpu = i;
8519                 if (debug > 1)
8520                         fprintf(outf, "base_cpu = %d\n", base_cpu);
8521                 return;
8522         }
8523         err(-ENODEV, "No valid cpus found");
8524 }
8525 
8526 bool has_added_counters(void)
8527 {
8528         /*
8529          * It only makes sense to call this after the command line is parsed,
8530          * otherwise sys structure is not populated.
8531          */
8532 
8533         return sys.added_core_counters | sys.added_thread_counters | sys.added_package_counters;
8534 }
8535 
8536 void check_msr_access(void)
8537 {
8538         check_dev_msr();
8539         check_msr_permission();
8540 
8541         if (no_msr)
8542                 bic_disable_msr_access();
8543 }
8544 
8545 void check_perf_access(void)
8546 {
8547         if (no_perf || !BIC_IS_ENABLED(BIC_IPC) || !has_instr_count_access())
8548                 bic_enabled &= ~BIC_IPC;
8549 }
8550 
8551 int added_perf_counters_init_(struct perf_counter_info *pinfo)
8552 {
8553         size_t num_domains = 0;
8554         unsigned int next_domain;
8555         bool *domain_visited;
8556         unsigned int perf_type, perf_config;
8557         double perf_scale;
8558         int fd_perf;
8559 
8560         if (!pinfo)
8561                 return 0;
8562 
8563         const size_t max_num_domains = MAX(topo.max_cpu_num + 1, MAX(topo.max_core_id + 1, topo.max_package_id + 1));
8564 
8565         domain_visited = calloc(max_num_domains, sizeof(*domain_visited));
8566 
8567         while (pinfo) {
8568                 switch (pinfo->scope) {
8569                 case SCOPE_CPU:
8570                         num_domains = topo.max_cpu_num + 1;
8571                         break;
8572 
8573                 case SCOPE_CORE:
8574                         num_domains = topo.max_core_id + 1;
8575                         break;
8576 
8577                 case SCOPE_PACKAGE:
8578                         num_domains = topo.max_package_id + 1;
8579                         break;
8580                 }
8581 
8582                 /* Allocate buffer for file descriptor for each domain. */
8583                 pinfo->fd_perf_per_domain = calloc(num_domains, sizeof(*pinfo->fd_perf_per_domain));
8584                 if (!pinfo->fd_perf_per_domain)
8585                         errx(1, "%s: alloc %s", __func__, "fd_perf_per_domain");
8586 
8587                 for (size_t i = 0; i < num_domains; ++i)
8588                         pinfo->fd_perf_per_domain[i] = -1;
8589 
8590                 pinfo->num_domains = num_domains;
8591                 pinfo->scale = 1.0;
8592 
8593                 memset(domain_visited, 0, max_num_domains * sizeof(*domain_visited));
8594 
8595                 for (int cpu = 0; cpu < topo.max_cpu_num + 1; ++cpu) {
8596 
8597                         next_domain = cpu_to_domain(pinfo, cpu);
8598 
8599                         assert(next_domain < num_domains);
8600 
8601                         if (cpu_is_not_allowed(cpu))
8602                                 continue;
8603 
8604                         if (domain_visited[next_domain])
8605                                 continue;
8606 
8607                         perf_type = read_perf_type(pinfo->device);
8608                         if (perf_type == (unsigned int)-1) {
8609                                 warnx("%s: perf/%s/%s: failed to read %s",
8610                                       __func__, pinfo->device, pinfo->event, "type");
8611                                 continue;
8612                         }
8613 
8614                         perf_config = read_perf_config(pinfo->device, pinfo->event);
8615                         if (perf_config == (unsigned int)-1) {
8616                                 warnx("%s: perf/%s/%s: failed to read %s",
8617                                       __func__, pinfo->device, pinfo->event, "config");
8618                                 continue;
8619                         }
8620 
8621                         /* Scale is not required, some counters just don't have it. */
8622                         perf_scale = read_perf_scale(pinfo->device, pinfo->event);
8623                         if (perf_scale == 0.0)
8624                                 perf_scale = 1.0;
8625 
8626                         fd_perf = open_perf_counter(cpu, perf_type, perf_config, -1, 0);
8627                         if (fd_perf == -1) {
8628                                 warnx("%s: perf/%s/%s: failed to open counter on cpu%d",
8629                                       __func__, pinfo->device, pinfo->event, cpu);
8630                                 continue;
8631                         }
8632 
8633                         domain_visited[next_domain] = 1;
8634                         pinfo->fd_perf_per_domain[next_domain] = fd_perf;
8635                         pinfo->scale = perf_scale;
8636 
8637                         if (debug)
8638                                 fprintf(stderr, "Add perf/%s/%s cpu%d: %d\n",
8639                                         pinfo->device, pinfo->event, cpu, pinfo->fd_perf_per_domain[next_domain]);
8640                 }
8641 
8642                 pinfo = pinfo->next;
8643         }
8644 
8645         free(domain_visited);
8646 
8647         return 0;
8648 }
8649 
8650 void added_perf_counters_init(void)
8651 {
8652         if (added_perf_counters_init_(sys.perf_tp))
8653                 errx(1, "%s: %s", __func__, "thread");
8654 
8655         if (added_perf_counters_init_(sys.perf_cp))
8656                 errx(1, "%s: %s", __func__, "core");
8657 
8658         if (added_perf_counters_init_(sys.perf_pp))
8659                 errx(1, "%s: %s", __func__, "package");
8660 }
8661 
8662 int parse_telem_info_file(int fd_dir, const char *info_filename, const char *format, unsigned long *output)
8663 {
8664         int fd_telem_info;
8665         FILE *file_telem_info;
8666         unsigned long value;
8667 
8668         fd_telem_info = openat(fd_dir, info_filename, O_RDONLY);
8669         if (fd_telem_info == -1)
8670                 return -1;
8671 
8672         file_telem_info = fdopen(fd_telem_info, "r");
8673         if (file_telem_info == NULL) {
8674                 close(fd_telem_info);
8675                 return -1;
8676         }
8677 
8678         if (fscanf(file_telem_info, format, &value) != 1) {
8679                 fclose(file_telem_info);
8680                 return -1;
8681         }
8682 
8683         fclose(file_telem_info);
8684 
8685         *output = value;
8686 
8687         return 0;
8688 }
8689 
8690 struct pmt_mmio *pmt_mmio_open(unsigned int target_guid)
8691 {
8692         DIR *dirp;
8693         struct dirent *entry;
8694         struct stat st;
8695         unsigned int telem_idx;
8696         int fd_telem_dir, fd_pmt;
8697         unsigned long guid, size, offset;
8698         size_t mmap_size;
8699         void *mmio;
8700         struct pmt_mmio *ret = NULL;
8701 
8702         if (stat(SYSFS_TELEM_PATH, &st) == -1)
8703                 return NULL;
8704 
8705         dirp = opendir(SYSFS_TELEM_PATH);
8706         if (dirp == NULL)
8707                 return NULL;
8708 
8709         for (;;) {
8710                 entry = readdir(dirp);
8711 
8712                 if (entry == NULL)
8713                         break;
8714 
8715                 if (strcmp(entry->d_name, ".") == 0)
8716                         continue;
8717 
8718                 if (strcmp(entry->d_name, "..") == 0)
8719                         continue;
8720 
8721                 if (sscanf(entry->d_name, "telem%u", &telem_idx) != 1)
8722                         continue;
8723 
8724                 if (fstatat(dirfd(dirp), entry->d_name, &st, 0) == -1) {
8725                         break;
8726                 }
8727 
8728                 if (!S_ISDIR(st.st_mode))
8729                         continue;
8730 
8731                 fd_telem_dir = openat(dirfd(dirp), entry->d_name, O_RDONLY);
8732                 if (fd_telem_dir == -1) {
8733                         break;
8734                 }
8735 
8736                 if (parse_telem_info_file(fd_telem_dir, "guid", "%lx", &guid)) {
8737                         close(fd_telem_dir);
8738                         break;
8739                 }
8740 
8741                 if (parse_telem_info_file(fd_telem_dir, "size", "%lu", &size)) {
8742                         close(fd_telem_dir);
8743                         break;
8744                 }
8745 
8746                 if (guid != target_guid) {
8747                         close(fd_telem_dir);
8748                         continue;
8749                 }
8750 
8751                 if (parse_telem_info_file(fd_telem_dir, "offset", "%lu", &offset)) {
8752                         close(fd_telem_dir);
8753                         break;
8754                 }
8755 
8756                 assert(offset == 0);
8757 
8758                 fd_pmt = openat(fd_telem_dir, "telem", O_RDONLY);
8759                 if (fd_pmt == -1)
8760                         goto loop_cleanup_and_break;
8761 
8762                 mmap_size = (size + 0x1000UL) & (~0x1000UL);
8763                 mmio = mmap(0, mmap_size, PROT_READ, MAP_SHARED, fd_pmt, 0);
8764                 if (mmio != MAP_FAILED) {
8765 
8766                         if (debug)
8767                                 fprintf(stderr, "%s: 0x%lx mmaped at: %p\n", __func__, guid, mmio);
8768 
8769                         ret = calloc(1, sizeof(*ret));
8770 
8771                         if (!ret) {
8772                                 fprintf(stderr, "%s: Failed to allocate pmt_mmio\n", __func__);
8773                                 exit(1);
8774                         }
8775 
8776                         ret->guid = guid;
8777                         ret->mmio_base = mmio;
8778                         ret->pmt_offset = offset;
8779                         ret->size = size;
8780 
8781                         ret->next = pmt_mmios;
8782                         pmt_mmios = ret;
8783                 }
8784 
8785 loop_cleanup_and_break:
8786                 close(fd_pmt);
8787                 close(fd_telem_dir);
8788                 break;
8789         }
8790 
8791         closedir(dirp);
8792 
8793         return ret;
8794 }
8795 
8796 struct pmt_mmio *pmt_mmio_find(unsigned int guid)
8797 {
8798         struct pmt_mmio *pmmio = pmt_mmios;
8799 
8800         while (pmmio) {
8801                 if (pmmio->guid == guid)
8802                         return pmmio;
8803 
8804                 pmmio = pmmio->next;
8805         }
8806 
8807         return NULL;
8808 }
8809 
8810 void *pmt_get_counter_pointer(struct pmt_mmio *pmmio, unsigned long counter_offset)
8811 {
8812         char *ret;
8813 
8814         /* Get base of mmaped PMT file. */
8815         ret = (char *)pmmio->mmio_base;
8816 
8817         /*
8818          * Apply PMT MMIO offset to obtain beginning of the mmaped telemetry data.
8819          * It's not guaranteed that the mmaped memory begins with the telemetry data
8820          *      - we might have to apply the offset first.
8821          */
8822         ret += pmmio->pmt_offset;
8823 
8824         /* Apply the counter offset to get the address to the mmaped counter. */
8825         ret += counter_offset;
8826 
8827         return ret;
8828 }
8829 
/*
 * Return the PMT region for guid, mapping it on first use.
 * Returns NULL if the region cannot be found or mapped.
 */
struct pmt_mmio *pmt_add_guid(unsigned int guid)
{
	struct pmt_mmio *pmmio = pmt_mmio_find(guid);

	return pmmio ? pmmio : pmt_mmio_open(guid);
}
8840 
/* Policy applied by pmt_add_counter() when the PMT region cannot be mapped. */
enum pmt_open_mode {
	PMT_OPEN_TRY,		/* Open failure is not an error. */
	PMT_OPEN_REQUIRED,	/* Open failure is a fatal error. */
};
8845 
8846 struct pmt_counter *pmt_find_counter(struct pmt_counter *pcounter, const char *name)
8847 {
8848         while (pcounter) {
8849                 if (strcmp(pcounter->name, name) == 0)
8850                         break;
8851 
8852                 pcounter = pcounter->next;
8853         }
8854 
8855         return pcounter;
8856 }
8857 
8858 struct pmt_counter **pmt_get_scope_root(enum counter_scope scope)
8859 {
8860         switch (scope) {
8861         case SCOPE_CPU:
8862                 return &sys.pmt_tp;
8863         case SCOPE_CORE:
8864                 return &sys.pmt_cp;
8865         case SCOPE_PACKAGE:
8866                 return &sys.pmt_pp;
8867         }
8868 
8869         __builtin_unreachable();
8870 }
8871 
8872 void pmt_counter_add_domain(struct pmt_counter *pcounter, unsigned long *pmmio, unsigned int domain_id)
8873 {
8874         /* Make sure the new domain fits. */
8875         if (domain_id >= pcounter->num_domains)
8876                 pmt_counter_resize(pcounter, domain_id + 1);
8877 
8878         assert(pcounter->domains);
8879         assert(domain_id < pcounter->num_domains);
8880 
8881         pcounter->domains[domain_id].pcounter = pmmio;
8882 }
8883 
8884 int pmt_add_counter(unsigned int guid, const char *name, enum pmt_datatype type,
8885                     unsigned int lsb, unsigned int msb, unsigned int offset, enum counter_scope scope,
8886                     enum counter_format format, unsigned int domain_id, enum pmt_open_mode mode)
8887 {
8888         struct pmt_mmio *mmio;
8889         struct pmt_counter *pcounter;
8890         struct pmt_counter **const pmt_root = pmt_get_scope_root(scope);
8891         bool new_counter = false;
8892         int conflict = 0;
8893 
8894         if (lsb > msb) {
8895                 fprintf(stderr, "%s: %s: `%s` must be satisfied\n", __func__, "lsb <= msb", name);
8896                 exit(1);
8897         }
8898 
8899         if (msb >= 64) {
8900                 fprintf(stderr, "%s: %s: `%s` must be satisfied\n", __func__, "msb < 64", name);
8901                 exit(1);
8902         }
8903 
8904         mmio = pmt_add_guid(guid);
8905         if (!mmio) {
8906                 if (mode != PMT_OPEN_TRY) {
8907                         fprintf(stderr, "%s: failed to map PMT MMIO for guid %x\n", __func__, guid);
8908                         exit(1);
8909                 }
8910 
8911                 return 1;
8912         }
8913 
8914         if (offset >= mmio->size) {
8915                 if (mode != PMT_OPEN_TRY) {
8916                         fprintf(stderr, "%s: offset %u outside of PMT MMIO size %u\n", __func__, offset, mmio->size);
8917                         exit(1);
8918                 }
8919 
8920                 return 1;
8921         }
8922 
8923         pcounter = pmt_find_counter(*pmt_root, name);
8924         if (!pcounter) {
8925                 pcounter = calloc(1, sizeof(*pcounter));
8926                 new_counter = true;
8927         }
8928 
8929         if (new_counter) {
8930                 strncpy(pcounter->name, name, ARRAY_SIZE(pcounter->name) - 1);
8931                 pcounter->type = type;
8932                 pcounter->scope = scope;
8933                 pcounter->lsb = lsb;
8934                 pcounter->msb = msb;
8935                 pcounter->format = format;
8936         } else {
8937                 conflict += pcounter->type != type;
8938                 conflict += pcounter->scope != scope;
8939                 conflict += pcounter->lsb != lsb;
8940                 conflict += pcounter->msb != msb;
8941                 conflict += pcounter->format != format;
8942         }
8943 
8944         if (conflict) {
8945                 fprintf(stderr, "%s: conflicting parameters for the PMT counter with the same name %s\n",
8946                         __func__, name);
8947                 exit(1);
8948         }
8949 
8950         pmt_counter_add_domain(pcounter, pmt_get_counter_pointer(mmio, offset), domain_id);
8951 
8952         if (new_counter) {
8953                 pcounter->next = *pmt_root;
8954                 *pmt_root = pcounter;
8955         }
8956 
8957         return 0;
8958 }
8959 
8960 void pmt_init(void)
8961 {
8962         if (BIC_IS_ENABLED(BIC_Diec6)) {
8963                 pmt_add_counter(PMT_MTL_DC6_GUID, "Die%c6", PMT_TYPE_XTAL_TIME, PMT_COUNTER_MTL_DC6_LSB,
8964                                 PMT_COUNTER_MTL_DC6_MSB, PMT_COUNTER_MTL_DC6_OFFSET, SCOPE_PACKAGE, FORMAT_DELTA,
8965                                 0, PMT_OPEN_TRY);
8966         }
8967 }
8968 
/*
 * One-time start-up initialization, run before the first measurement
 * interval: allocate topology buffers, probe MSR/perf access, then
 * register and open every counter source.
 */
void turbostat_init()
{
	setup_all_buffers(true);
	set_base_cpu();
	check_msr_access();
	check_perf_access();
	process_cpuid();
	counter_info_init();
	probe_pm_features();
	msr_perf_init();
	linux_perf_init();
	rapl_perf_init();
	cstate_perf_init();
	added_perf_counters_init();
	pmt_init();

	/* Record each CPU's type into both counter snapshots. */
	for_all_cpus(get_cpu_type, ODD_COUNTERS);
	for_all_cpus(get_cpu_type, EVEN_COUNTERS);

	/* IPC also needs APERF access and a readable instruction-count fd. */
	if (BIC_IS_ENABLED(BIC_IPC) && has_aperf_access && get_instr_count_fd(base_cpu) != -1)
		BIC_PRESENT(BIC_IPC);

	/*
	 * If TSC tweak is needed, but couldn't get it,
	 * disable more BICs, since it can't be reported accurately.
	 */
	if (platform->enable_tsc_tweak && !has_base_hz) {
		bic_enabled &= ~BIC_Busy;
		bic_enabled &= ~BIC_Bzy_MHz;
	}
}
9000 
/*
 * Run the given command while measuring the whole system around it:
 * take a counter snapshot, fork/exec the child, wait for it, take a
 * second snapshot, and print the delta for the child's run time.
 *
 * Returns the child's exit status (or the raw wait status if the child
 * did not exit normally).
 */
int fork_it(char **argv)
{
	pid_t child_pid;
	int status;

	snapshot_proc_sysfs_files();
	status = for_all_cpus(get_counters, EVEN_COUNTERS);
	first_counter_read = 0;
	if (status)
		exit(status);
	gettimeofday(&tv_even, (struct timezone *)NULL);

	child_pid = fork();
	if (!child_pid) {
		/* child */
		execvp(argv[0], argv);
		err(errno, "exec %s", argv[0]);
	} else {

		/* parent */
		if (child_pid == -1)
			err(1, "fork");

		/* Let only the child react to keyboard interrupts. */
		signal(SIGINT, SIG_IGN);
		signal(SIGQUIT, SIG_IGN);
		if (waitpid(child_pid, &status, 0) == -1)
			err(status, "waitpid");

		if (WIFEXITED(status))
			status = WEXITSTATUS(status);
	}
	/*
	 * n.b. fork_it() does not check for errors from for_all_cpus()
	 * because re-starting is problematic when forking
	 */
	snapshot_proc_sysfs_files();
	for_all_cpus(get_counters, ODD_COUNTERS);
	gettimeofday(&tv_odd, (struct timezone *)NULL);
	timersub(&tv_odd, &tv_even, &tv_delta);
	/* A counter reset mid-run makes the delta meaningless; say so instead. */
	if (for_all_cpus_2(delta_cpu, ODD_COUNTERS, EVEN_COUNTERS))
		fprintf(outf, "%s: Counter reset detected\n", progname);
	else {
		compute_average(EVEN_COUNTERS);
		format_all_counters(EVEN_COUNTERS);
	}

	fprintf(outf, "%.6f sec\n", tv_delta.tv_sec + tv_delta.tv_usec / 1000000.0);

	flush_output_stderr();

	return status;
}
9053 
9054 int get_and_dump_counters(void)
9055 {
9056         int status;
9057 
9058         snapshot_proc_sysfs_files();
9059         status = for_all_cpus(get_counters, ODD_COUNTERS);
9060         if (status)
9061                 return status;
9062 
9063         status = for_all_cpus(dump_counters, ODD_COUNTERS);
9064         if (status)
9065                 return status;
9066 
9067         flush_output_stdout();
9068 
9069         return status;
9070 }
9071 
9072 void print_version()
9073 {
9074         fprintf(outf, "turbostat version 2024.07.26 - Len Brown <lenb@kernel.org>\n");
9075 }
9076 
9077 #define COMMAND_LINE_SIZE 2048
9078 
9079 void print_bootcmd(void)
9080 {
9081         char bootcmd[COMMAND_LINE_SIZE];
9082         FILE *fp;
9083         int ret;
9084 
9085         memset(bootcmd, 0, COMMAND_LINE_SIZE);
9086         fp = fopen("/proc/cmdline", "r");
9087         if (!fp)
9088                 return;
9089 
9090         ret = fread(bootcmd, sizeof(char), COMMAND_LINE_SIZE - 1, fp);
9091         if (ret) {
9092                 bootcmd[ret] = '\0';
9093                 /* the last character is already '\n' */
9094                 fprintf(outf, "Kernel command line: %s", bootcmd);
9095         }
9096 
9097         fclose(fp);
9098 }
9099 
9100 struct msr_counter *find_msrp_by_name(struct msr_counter *head, char *name)
9101 {
9102         struct msr_counter *mp;
9103 
9104         for (mp = head; mp; mp = mp->next) {
9105                 if (debug)
9106                         fprintf(stderr, "%s: %s %s\n", __func__, name, mp->name);
9107                 if (!strncmp(name, mp->name, strlen(mp->name)))
9108                         return mp;
9109         }
9110         return NULL;
9111 }
9112 
/*
 * Register a user-added (--add) MSR and/or sysfs-path counter under the
 * given scope.  If a counter with the same name already exists in that
 * scope, the sysfs path (if any) is attached to it instead of creating
 * a duplicate.  Returns 0 on success, -1 if the counter was rejected
 * (per-scope limit reached or unknown scope).
 */
int add_counter(unsigned int msr_num, char *path, char *name,
		unsigned int width, enum counter_scope scope,
		enum counter_type type, enum counter_format format, int flags, int id)
{
	struct msr_counter *msrp;

	/* MSR access may have been disabled on the command line. */
	if (no_msr && msr_num)
		errx(1, "Requested MSR counter 0x%x, but in --no-msr mode", msr_num);

	if (debug)
		fprintf(stderr, "%s(msr%d, %s, %s, width%d, scope%d, type%d, format%d, flags%x, id%d)\n",
			__func__, msr_num, path, name, width, scope, type, format, flags, id);

	/* Find an existing counter of this name, or enforce the per-scope limit. */
	switch (scope) {

	case SCOPE_CPU:
		msrp = find_msrp_by_name(sys.tp, name);
		if (msrp) {
			if (debug)
				fprintf(stderr, "%s: %s FOUND\n", __func__, name);
			break;
		}
		if (sys.added_thread_counters++ >= MAX_ADDED_THREAD_COUNTERS) {
			warnx("ignoring thread counter %s", name);
			return -1;
		}
		break;
	case SCOPE_CORE:
		msrp = find_msrp_by_name(sys.cp, name);
		if (msrp) {
			if (debug)
				fprintf(stderr, "%s: %s FOUND\n", __func__, name);
			break;
		}
		if (sys.added_core_counters++ >= MAX_ADDED_CORE_COUNTERS) {
			warnx("ignoring core counter %s", name);
			return -1;
		}
		break;
	case SCOPE_PACKAGE:
		msrp = find_msrp_by_name(sys.pp, name);
		if (msrp) {
			if (debug)
				fprintf(stderr, "%s: %s FOUND\n", __func__, name);
			break;
		}
		if (sys.added_package_counters++ >= MAX_ADDED_PACKAGE_COUNTERS) {
			warnx("ignoring package counter %s", name);
			return -1;
		}
		break;
	default:
		warnx("ignoring counter %s with unknown scope", name);
		return -1;
	}

	/* Not found above: allocate a new counter and link it into its scope list. */
	if (msrp == NULL) {
		msrp = calloc(1, sizeof(struct msr_counter));
		if (msrp == NULL)
			err(-1, "calloc msr_counter");

		msrp->msr_num = msr_num;
		strncpy(msrp->name, name, NAME_BYTES - 1);
		msrp->width = width;
		msrp->type = type;
		msrp->format = format;
		msrp->flags = flags;

		switch (scope) {
		case SCOPE_CPU:
			msrp->next = sys.tp;
			sys.tp = msrp;
			break;
		case SCOPE_CORE:
			msrp->next = sys.cp;
			sys.cp = msrp;
			break;
		case SCOPE_PACKAGE:
			msrp->next = sys.pp;
			sys.pp = msrp;
			break;
		}
	}

	/* Attach the optional sysfs path to the counter's path list. */
	if (path) {
		struct sysfs_path *sp;

		sp = calloc(1, sizeof(struct sysfs_path));
		if (sp == NULL) {
			perror("calloc");
			exit(1);
		}
		strncpy(sp->path, path, PATH_BYTES - 1);
		sp->id = id;
		sp->next = msrp->sp;
		msrp->sp = sp;
	}

	return 0;
}
9213 
9214 /*
9215  * Initialize the fields used for identifying and opening the counter.
9216  *
9217  * Defer the initialization of any runtime buffers for actually reading
9218  * the counters for when we initialize all perf counters, so we can later
9219  * easily call re_initialize().
9220  */
9221 struct perf_counter_info *make_perf_counter_info(const char *perf_device,
9222                                                  const char *perf_event,
9223                                                  const char *name,
9224                                                  unsigned int width,
9225                                                  enum counter_scope scope,
9226                                                  enum counter_type type, enum counter_format format)
9227 {
9228         struct perf_counter_info *pinfo;
9229 
9230         pinfo = calloc(1, sizeof(*pinfo));
9231         if (!pinfo)
9232                 errx(1, "%s: Failed to allocate %s/%s\n", __func__, perf_device, perf_event);
9233 
9234         strncpy(pinfo->device, perf_device, ARRAY_SIZE(pinfo->device) - 1);
9235         strncpy(pinfo->event, perf_event, ARRAY_SIZE(pinfo->event) - 1);
9236 
9237         strncpy(pinfo->name, name, ARRAY_SIZE(pinfo->name) - 1);
9238         pinfo->width = width;
9239         pinfo->scope = scope;
9240         pinfo->type = type;
9241         pinfo->format = format;
9242 
9243         return pinfo;
9244 }
9245 
9246 int add_perf_counter(const char *perf_device, const char *perf_event, const char *name_buffer, unsigned int width,
9247                      enum counter_scope scope, enum counter_type type, enum counter_format format)
9248 {
9249         struct perf_counter_info *pinfo;
9250 
9251         switch (scope) {
9252         case SCOPE_CPU:
9253                 if (sys.added_thread_perf_counters >= MAX_ADDED_THREAD_COUNTERS) {
9254                         warnx("ignoring thread counter perf/%s/%s", perf_device, perf_event);
9255                         return -1;
9256                 }
9257                 break;
9258 
9259         case SCOPE_CORE:
9260                 if (sys.added_core_perf_counters >= MAX_ADDED_CORE_COUNTERS) {
9261                         warnx("ignoring core counter perf/%s/%s", perf_device, perf_event);
9262                         return -1;
9263                 }
9264                 break;
9265 
9266         case SCOPE_PACKAGE:
9267                 if (sys.added_package_perf_counters >= MAX_ADDED_PACKAGE_COUNTERS) {
9268                         warnx("ignoring package counter perf/%s/%s", perf_device, perf_event);
9269                         return -1;
9270                 }
9271                 break;
9272         }
9273 
9274         pinfo = make_perf_counter_info(perf_device, perf_event, name_buffer, width, scope, type, format);
9275 
9276         if (!pinfo)
9277                 return -1;
9278 
9279         switch (scope) {
9280         case SCOPE_CPU:
9281                 pinfo->next = sys.perf_tp;
9282                 sys.perf_tp = pinfo;
9283                 ++sys.added_thread_perf_counters;
9284                 break;
9285 
9286         case SCOPE_CORE:
9287                 pinfo->next = sys.perf_cp;
9288                 sys.perf_cp = pinfo;
9289                 ++sys.added_core_perf_counters;
9290                 break;
9291 
9292         case SCOPE_PACKAGE:
9293                 pinfo->next = sys.perf_pp;
9294                 sys.perf_pp = pinfo;
9295                 ++sys.added_package_perf_counters;
9296                 break;
9297         }
9298 
9299         // FIXME: we might not have debug here yet
9300         if (debug)
9301                 fprintf(stderr, "%s: %s/%s, name: %s, scope%d\n",
9302                         __func__, pinfo->device, pinfo->event, pinfo->name, pinfo->scope);
9303 
9304         return 0;
9305 }
9306 
/*
 * Parse one --add argument describing an MSR, sysfs, or perf counter:
 * a comma-separated list of tokens such as msrDDD / msr0xXXX,
 * /sysfs/path, perf/device/event, u32|u64 (width),
 * cpu|core|package (scope), cycles|seconds|usec (type),
 * raw|delta|percent (format), and an optional column name.
 * Registers the resulting counter; on any failure prints help() and exits.
 */
void parse_add_command_msr(char *add_command)
{
	int msr_num = 0;
	char *path = NULL;
	char perf_device[PERF_DEV_NAME_BYTES] = "";
	char perf_event[PERF_EVT_NAME_BYTES] = "";
	char name_buffer[PERF_NAME_BYTES] = "";
	int width = 64;
	int fail = 0;
	enum counter_scope scope = SCOPE_CPU;
	enum counter_type type = COUNTER_CYCLES;
	enum counter_format format = FORMAT_DELTA;

	/* Each iteration consumes one comma-separated token. */
	while (add_command) {

		if (sscanf(add_command, "msr0x%x", &msr_num) == 1)
			goto next;

		if (sscanf(add_command, "msr%d", &msr_num) == 1)
			goto next;

		/* The %31 field widths below must fit the buffers declared above. */
		BUILD_BUG_ON(ARRAY_SIZE(perf_device) <= 31);
		BUILD_BUG_ON(ARRAY_SIZE(perf_event) <= 31);
		if (sscanf(add_command, "perf/%31[^/]/%31[^,]", &perf_device[0], &perf_event[0]) == 2)
			goto next;

		if (*add_command == '/') {
			path = add_command;
			goto next;
		}

		/* Only 32- and 64-bit widths are accepted; anything else falls back to 64. */
		if (sscanf(add_command, "u%d", &width) == 1) {
			if ((width == 32) || (width == 64))
				goto next;
			width = 64;
		}
		if (!strncmp(add_command, "cpu", strlen("cpu"))) {
			scope = SCOPE_CPU;
			goto next;
		}
		if (!strncmp(add_command, "core", strlen("core"))) {
			scope = SCOPE_CORE;
			goto next;
		}
		if (!strncmp(add_command, "package", strlen("package"))) {
			scope = SCOPE_PACKAGE;
			goto next;
		}
		if (!strncmp(add_command, "cycles", strlen("cycles"))) {
			type = COUNTER_CYCLES;
			goto next;
		}
		if (!strncmp(add_command, "seconds", strlen("seconds"))) {
			type = COUNTER_SECONDS;
			goto next;
		}
		if (!strncmp(add_command, "usec", strlen("usec"))) {
			type = COUNTER_USEC;
			goto next;
		}
		if (!strncmp(add_command, "raw", strlen("raw"))) {
			format = FORMAT_RAW;
			goto next;
		}
		if (!strncmp(add_command, "delta", strlen("delta"))) {
			format = FORMAT_DELTA;
			goto next;
		}
		if (!strncmp(add_command, "percent", strlen("percent"))) {
			format = FORMAT_PERCENT;
			goto next;
		}

		/* Anything unrecognized is taken as the user-supplied column name. */
		BUILD_BUG_ON(ARRAY_SIZE(name_buffer) <= 18);
		if (sscanf(add_command, "%18s,%*s", name_buffer) == 1) {
			char *eos;

			eos = strchr(name_buffer, ',');
			if (eos)
				*eos = '\0';
			goto next;
		}

next:
		/* Advance to the token after the next comma (destructive split). */
		add_command = strchr(add_command, ',');
		if (add_command) {
			*add_command = '\0';
			add_command++;
		}

	}
	if ((msr_num == 0) && (path == NULL) && (perf_device[0] == '\0' || perf_event[0] == '\0')) {
		fprintf(stderr, "--add: (msrDDD | msr0xXXX | /path_to_counter | perf/device/event ) required\n");
		fail++;
	}

	/* Test for non-empty perf_device and perf_event */
	const bool is_perf_counter = perf_device[0] && perf_event[0];

	/* generate default column header */
	if (*name_buffer == '\0') {
		if (is_perf_counter) {
			snprintf(name_buffer, ARRAY_SIZE(name_buffer), "perf/%s", perf_event);
		} else {
			if (width == 32)
				sprintf(name_buffer, "M0x%x%s", msr_num, format == FORMAT_PERCENT ? "%" : "");
			else
				sprintf(name_buffer, "M0X%x%s", msr_num, format == FORMAT_PERCENT ? "%" : "");
		}
	}

	if (is_perf_counter) {
		if (add_perf_counter(perf_device, perf_event, name_buffer, width, scope, type, format))
			fail++;
	} else {
		if (add_counter(msr_num, path, name_buffer, width, scope, type, format, 0, 0))
			fail++;
	}

	if (fail) {
		help();
		exit(1);
	}
}
9431 
/* Return true if str begins with prefix (every string starts with ""). */
bool starts_with(const char *str, const char *prefix)
{
	size_t prefix_len = strlen(prefix);

	return strncmp(str, prefix, prefix_len) == 0;
}
9436 
9437 void parse_add_command_pmt(char *add_command)
9438 {
9439         char *name = NULL;
9440         char *type_name = NULL;
9441         char *format_name = NULL;
9442         unsigned int offset;
9443         unsigned int lsb;
9444         unsigned int msb;
9445         unsigned int guid;
9446         unsigned int domain_id;
9447         enum counter_scope scope = 0;
9448         enum pmt_datatype type = PMT_TYPE_RAW;
9449         enum counter_format format = FORMAT_RAW;
9450         bool has_offset = false;
9451         bool has_lsb = false;
9452         bool has_msb = false;
9453         bool has_format = true; /* Format has a default value. */
9454         bool has_guid = false;
9455         bool has_scope = false;
9456         bool has_type = true;   /* Type has a default value. */
9457 
9458         /* Consume the "pmt," prefix. */
9459         add_command = strchr(add_command, ',');
9460         if (!add_command) {
9461                 help();
9462                 exit(1);
9463         }
9464         ++add_command;
9465 
9466         while (add_command) {
9467                 if (starts_with(add_command, "name=")) {
9468                         name = add_command + strlen("name=");
9469                         goto next;
9470                 }
9471 
9472                 if (starts_with(add_command, "type=")) {
9473                         type_name = add_command + strlen("type=");
9474                         goto next;
9475                 }
9476 
9477                 if (starts_with(add_command, "domain=")) {
9478                         const size_t prefix_len = strlen("domain=");
9479 
9480                         if (sscanf(add_command + prefix_len, "cpu%u", &domain_id) == 1) {
9481                                 scope = SCOPE_CPU;
9482                                 has_scope = true;
9483                         } else if (sscanf(add_command + prefix_len, "core%u", &domain_id) == 1) {
9484                                 scope = SCOPE_CORE;
9485                                 has_scope = true;
9486                         } else if (sscanf(add_command + prefix_len, "package%u", &domain_id) == 1) {
9487                                 scope = SCOPE_PACKAGE;
9488                                 has_scope = true;
9489                         }
9490 
9491                         if (!has_scope) {
9492                                 printf("%s: invalid value for scope. Expected cpu%%u, core%%u or package%%u.\n",
9493                                        __func__);
9494                                 exit(1);
9495                         }
9496 
9497                         goto next;
9498                 }
9499 
9500                 if (starts_with(add_command, "format=")) {
9501                         format_name = add_command + strlen("format=");
9502                         goto next;
9503                 }
9504 
9505                 if (sscanf(add_command, "offset=%u", &offset) == 1) {
9506                         has_offset = true;
9507                         goto next;
9508                 }
9509 
9510                 if (sscanf(add_command, "lsb=%u", &lsb) == 1) {
9511                         has_lsb = true;
9512                         goto next;
9513                 }
9514 
9515                 if (sscanf(add_command, "msb=%u", &msb) == 1) {
9516                         has_msb = true;
9517                         goto next;
9518                 }
9519 
9520                 if (sscanf(add_command, "guid=%x", &guid) == 1) {
9521                         has_guid = true;
9522                         goto next;
9523                 }
9524 
9525 next:
9526                 add_command = strchr(add_command, ',');
9527                 if (add_command) {
9528                         *add_command = '\0';
9529                         add_command++;
9530                 }
9531         }
9532 
9533         if (!name) {
9534                 printf("%s: missing %s\n", __func__, "name");
9535                 exit(1);
9536         }
9537 
9538         if (strlen(name) >= PMT_COUNTER_NAME_SIZE_BYTES) {
9539                 printf("%s: name has to be at most %d characters long\n", __func__, PMT_COUNTER_NAME_SIZE_BYTES);
9540                 exit(1);
9541         }
9542 
9543         if (format_name) {
9544                 has_format = false;
9545 
9546                 if (strcmp("raw", format_name) == 0) {
9547                         format = FORMAT_RAW;
9548                         has_format = true;
9549                 }
9550 
9551                 if (strcmp("delta", format_name) == 0) {
9552                         format = FORMAT_DELTA;
9553                         has_format = true;
9554                 }
9555 
9556                 if (!has_format) {
9557                         fprintf(stderr, "%s: Invalid format %s. Expected raw or delta\n", __func__, format_name);
9558                         exit(1);
9559                 }
9560         }
9561 
9562         if (type_name) {
9563                 has_type = false;
9564 
9565                 if (strcmp("raw", type_name) == 0) {
9566                         type = PMT_TYPE_RAW;
9567                         has_type = true;
9568                 }
9569 
9570                 if (strcmp("txtal_time", type_name) == 0) {
9571                         type = PMT_TYPE_XTAL_TIME;
9572                         has_type = true;
9573                 }
9574 
9575                 if (!has_type) {
9576                         printf("%s: invalid %s: %s\n", __func__, "type", type_name);
9577                         exit(1);
9578                 }
9579         }
9580 
9581         if (!has_offset) {
9582                 printf("%s : missing %s\n", __func__, "offset");
9583                 exit(1);
9584         }
9585 
9586         if (!has_lsb) {
9587                 printf("%s: missing %s\n", __func__, "lsb");
9588                 exit(1);
9589         }
9590 
9591         if (!has_msb) {
9592                 printf("%s: missing %s\n", __func__, "msb");
9593                 exit(1);
9594         }
9595 
9596         if (!has_guid) {
9597                 printf("%s: missing %s\n", __func__, "guid");
9598                 exit(1);
9599         }
9600 
9601         if (!has_scope) {
9602                 printf("%s: missing %s\n", __func__, "scope");
9603                 exit(1);
9604         }
9605 
9606         if (lsb > msb) {
9607                 printf("%s: lsb > msb doesn't make sense\n", __func__);
9608                 exit(1);
9609         }
9610 
9611         pmt_add_counter(guid, name, type, lsb, msb, offset, scope, format, domain_id, PMT_OPEN_REQUIRED);
9612 }
9613 
/* Dispatch an --add argument to the PMT or the MSR counter parser. */
void parse_add_command(char *add_command)
{
	static const char pmt_prefix[] = "pmt";

	if (strncmp(add_command, pmt_prefix, sizeof(pmt_prefix) - 1) == 0) {
		parse_add_command_pmt(add_command);
		return;
	}

	parse_add_command_msr(add_command);
}
9620 
9621 int is_deferred_add(char *name)
9622 {
9623         int i;
9624 
9625         for (i = 0; i < deferred_add_index; ++i)
9626                 if (!strcmp(name, deferred_add_names[i]))
9627                         return 1;
9628         return 0;
9629 }
9630 
9631 int is_deferred_skip(char *name)
9632 {
9633         int i;
9634 
9635         for (i = 0; i < deferred_skip_index; ++i)
9636                 if (!strcmp(name, deferred_skip_names[i]))
9637                         return 1;
9638         return 0;
9639 }
9640 
9641 void probe_sysfs(void)
9642 {
9643         char path[64];
9644         char name_buf[16];
9645         FILE *input;
9646         int state;
9647         char *sp;
9648 
9649         for (state = 10; state >= 0; --state) {
9650 
9651                 sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state);
9652                 input = fopen(path, "r");
9653                 if (input == NULL)
9654                         continue;
9655                 if (!fgets(name_buf, sizeof(name_buf), input))
9656                         err(1, "%s: failed to read file", path);
9657 
9658                 /* truncate "C1-HSW\n" to "C1", or truncate "C1\n" to "C1" */
9659                 sp = strchr(name_buf, '-');
9660                 if (!sp)
9661                         sp = strchrnul(name_buf, '\n');
9662                 *sp = '%';
9663                 *(sp + 1) = '\0';
9664 
9665                 remove_underbar(name_buf);
9666 
9667                 fclose(input);
9668 
9669                 sprintf(path, "cpuidle/state%d/time", state);
9670 
9671                 if (!DO_BIC(BIC_sysfs) && !is_deferred_add(name_buf))
9672                         continue;
9673 
9674                 if (is_deferred_skip(name_buf))
9675                         continue;
9676 
9677                 add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_USEC, FORMAT_PERCENT, SYSFS_PERCPU, 0);
9678         }
9679 
9680         for (state = 10; state >= 0; --state) {
9681 
9682                 sprintf(path, "/sys/devices/system/cpu/cpu%d/cpuidle/state%d/name", base_cpu, state);
9683                 input = fopen(path, "r");
9684                 if (input == NULL)
9685                         continue;
9686                 if (!fgets(name_buf, sizeof(name_buf), input))
9687                         err(1, "%s: failed to read file", path);
9688                 /* truncate "C1-HSW\n" to "C1", or truncate "C1\n" to "C1" */
9689                 sp = strchr(name_buf, '-');
9690                 if (!sp)
9691                         sp = strchrnul(name_buf, '\n');
9692                 *sp = '\0';
9693                 fclose(input);
9694 
9695                 remove_underbar(name_buf);
9696 
9697                 sprintf(path, "cpuidle/state%d/usage", state);
9698 
9699                 if (!DO_BIC(BIC_sysfs) && !is_deferred_add(name_buf))
9700                         continue;
9701 
9702                 if (is_deferred_skip(name_buf))
9703                         continue;
9704 
9705                 add_counter(0, path, name_buf, 64, SCOPE_CPU, COUNTER_ITEMS, FORMAT_DELTA, SYSFS_PERCPU, 0);
9706         }
9707 
9708 }
9709 
9710 /*
9711  * parse cpuset with following syntax
9712  * 1,2,4..6,8-10 and set bits in cpu_subset
9713  */
9714 void parse_cpu_command(char *optarg)
9715 {
9716         if (!strcmp(optarg, "core")) {
9717                 if (cpu_subset)
9718                         goto error;
9719                 show_core_only++;
9720                 return;
9721         }
9722         if (!strcmp(optarg, "package")) {
9723                 if (cpu_subset)
9724                         goto error;
9725                 show_pkg_only++;
9726                 return;
9727         }
9728         if (show_core_only || show_pkg_only)
9729                 goto error;
9730 
9731         cpu_subset = CPU_ALLOC(CPU_SUBSET_MAXCPUS);
9732         if (cpu_subset == NULL)
9733                 err(3, "CPU_ALLOC");
9734         cpu_subset_size = CPU_ALLOC_SIZE(CPU_SUBSET_MAXCPUS);
9735 
9736         CPU_ZERO_S(cpu_subset_size, cpu_subset);
9737 
9738         if (parse_cpu_str(optarg, cpu_subset, cpu_subset_size))
9739                 goto error;
9740 
9741         return;
9742 
9743 error:
9744         fprintf(stderr, "\"--cpu %s\" malformed\n", optarg);
9745         help();
9746         exit(-1);
9747 }
9748 
/*
 * Parse the turbostat command line.
 *
 * Options are processed in two passes: --no-msr (-M) and --no-perf (-P)
 * are handled first because they can invalidate options parsed later
 * (e.g. an MSR counter requested with --add), then getopt state is
 * reset and the full option set is processed in command-line order.
 * Invalid options or -h print usage and exit.
 */
void cmdline(int argc, char **argv)
{
	int opt;
	int option_index = 0;
	static struct option long_options[] = {
		{ "add", required_argument, 0, 'a' },
		{ "cpu", required_argument, 0, 'c' },
		{ "Dump", no_argument, 0, 'D' },
		{ "debug", no_argument, 0, 'd' },	/* internal, not documented */
		{ "enable", required_argument, 0, 'e' },
		{ "interval", required_argument, 0, 'i' },
		{ "IPC", no_argument, 0, 'I' },
		{ "num_iterations", required_argument, 0, 'n' },
		{ "header_iterations", required_argument, 0, 'N' },
		{ "help", no_argument, 0, 'h' },
		{ "hide", required_argument, 0, 'H' },	// meh, -h taken by --help
		{ "Joules", no_argument, 0, 'J' },
		{ "list", no_argument, 0, 'l' },
		{ "out", required_argument, 0, 'o' },
		{ "quiet", no_argument, 0, 'q' },
		{ "no-msr", no_argument, 0, 'M' },
		{ "no-perf", no_argument, 0, 'P' },
		{ "show", required_argument, 0, 's' },
		{ "Summary", no_argument, 0, 'S' },
		{ "TCC", required_argument, 0, 'T' },
		{ "version", no_argument, 0, 'v' },
		{ 0, 0, 0, 0 }
	};

	progname = argv[0];

	/*
	 * Parse some options early, because they may make other options invalid,
	 * like adding the MSR counter with --add and at the same time using --no-msr.
	 */
	while ((opt = getopt_long_only(argc, argv, "MPn:", long_options, &option_index)) != -1) {
		switch (opt) {
		case 'M':
			no_msr = 1;
			break;
		case 'P':
			no_perf = 1;
			break;
		default:
			break;
		}
	}
	/* Reset getopt(3) state so the main pass rescans argv from the start. */
	optind = 0;

	while ((opt = getopt_long_only(argc, argv, "+C:c:Dde:hi:Jn:o:qMST:v", long_options, &option_index)) != -1) {
		switch (opt) {
		case 'a':
			parse_add_command(optarg);
			break;
		case 'c':
			parse_cpu_command(optarg);
			break;
		case 'D':
			dump_only++;
			break;
		case 'e':
			/* --enable specified counter */
			bic_enabled = bic_enabled | bic_lookup(optarg, SHOW_LIST);
			break;
		case 'd':
			debug++;
			ENABLE_BIC(BIC_DISABLED_BY_DEFAULT);
			break;
		case 'H':
			/*
			 * --hide: do not show those specified
			 *  multiple invocations simply clear more bits in enabled mask
			 */
			bic_enabled &= ~bic_lookup(optarg, HIDE_LIST);
			break;
		/* -h and any unrecognized option: print usage and exit */
		case 'h':
		default:
			help();
			exit(1);
		case 'i':
			{
				double interval = strtod(optarg, NULL);

				if (interval < 0.001) {
					fprintf(outf, "interval %f seconds is too small\n", interval);
					exit(2);
				}

				/* split the interval into whole seconds plus usec/nsec remainders */
				interval_tv.tv_sec = interval_ts.tv_sec = interval;
				interval_tv.tv_usec = (interval - interval_tv.tv_sec) * 1000000;
				interval_ts.tv_nsec = (interval - interval_ts.tv_sec) * 1000000000;
			}
			break;
		case 'J':
			rapl_joules++;
			break;
		case 'l':
			ENABLE_BIC(BIC_DISABLED_BY_DEFAULT);
			list_header_only++;
			quiet++;
			break;
		case 'o':
			outf = fopen_or_die(optarg, "w");
			break;
		case 'q':
			quiet = 1;
			break;
		case 'M':
		case 'P':
			/* Parsed earlier */
			break;
		case 'n':
			/* NOTE(review): parsed with strtod, then truncated to an int */
			num_iterations = strtod(optarg, NULL);

			if (num_iterations <= 0) {
				fprintf(outf, "iterations %d should be positive number\n", num_iterations);
				exit(2);
			}
			break;
		case 'N':
			header_iterations = strtod(optarg, NULL);

			if (header_iterations <= 0) {
				fprintf(outf, "iterations %d should be positive number\n", header_iterations);
				exit(2);
			}
			break;
		case 's':
			/*
			 * --show: show only those specified
			 *  The 1st invocation will clear and replace the enabled mask
			 *  subsequent invocations can add to it.
			 */
			if (shown == 0)
				bic_enabled = bic_lookup(optarg, SHOW_LIST);
			else
				bic_enabled |= bic_lookup(optarg, SHOW_LIST);
			shown = 1;
			break;
		case 'S':
			summary_only++;
			break;
		case 'T':
			tj_max_override = atoi(optarg);
			break;
		case 'v':
			print_version();
			exit(0);
			break;
		}
	}
}
9901 
9902 void set_rlimit(void)
9903 {
9904         struct rlimit limit;
9905 
9906         if (getrlimit(RLIMIT_NOFILE, &limit) < 0)
9907                 err(1, "Failed to get rlimit");
9908 
9909         if (limit.rlim_max < MAX_NOFILE)
9910                 limit.rlim_max = MAX_NOFILE;
9911         if (limit.rlim_cur < MAX_NOFILE)
9912                 limit.rlim_cur = MAX_NOFILE;
9913 
9914         if (setrlimit(RLIMIT_NOFILE, &limit) < 0)
9915                 err(1, "Failed to set rlimit");
9916 }
9917 
9918 int main(int argc, char **argv)
9919 {
9920         int fd, ret;
9921 
9922         fd = open("/sys/fs/cgroup/cgroup.procs", O_WRONLY);
9923         if (fd < 0)
9924                 goto skip_cgroup_setting;
9925 
9926         ret = write(fd, "\n", 2);
9927         if (ret == -1)
9928                 perror("Can't update cgroup\n");
9929 
9930         close(fd);
9931 
9932 skip_cgroup_setting:
9933         outf = stderr;
9934         cmdline(argc, argv);
9935 
9936         if (!quiet) {
9937                 print_version();
9938                 print_bootcmd();
9939         }
9940 
9941         probe_sysfs();
9942 
9943         if (!getuid())
9944                 set_rlimit();
9945 
9946         turbostat_init();
9947 
9948         if (!no_msr)
9949                 msr_sum_record();
9950 
9951         /* dump counters and exit */
9952         if (dump_only)
9953                 return get_and_dump_counters();
9954 
9955         /* list header and exit */
9956         if (list_header_only) {
9957                 print_header(",");
9958                 flush_output_stdout();
9959                 return 0;
9960         }
9961 
9962         /*
9963          * if any params left, it must be a command to fork
9964          */
9965         if (argc - optind)
9966                 return fork_it(argv + optind);
9967         else
9968                 turbostat_loop();
9969 
9970         return 0;
9971 }
9972 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

sflogo.php