Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf events changes for v3.4 from Ingo Molnar:

 - New "hardware based branch profiling" feature both on the kernel and
   the tooling side, on CPUs that support it.  (modern x86 Intel CPUs
   with the 'LBR' hardware feature currently.)

   This new feature is basically a sophisticated 'magnifying glass' for
   branch execution - something that is pretty difficult to extract from
   regular, function histogram centric profiles.

   The simplest mode is activated via 'perf record -b', and the result
   looks like this in perf report:

	$ perf record -b any_call,u -e cycles:u branchy

	$ perf report -b --sort=symbol
	    52.34%  [.] main                   [.] f1
	    24.04%  [.] f1                     [.] f3
	    23.60%  [.] f1                     [.] f2
	     0.01%  [k] _IO_new_file_xsputn    [k] _IO_file_overflow
	     0.01%  [k] _IO_vfprintf_internal  [k] _IO_new_file_xsputn
	     0.01%  [k] _IO_vfprintf_internal  [k] strchrnul
	     0.01%  [k] __printf               [k] _IO_vfprintf_internal
	     0.01%  [k] main                   [k] __printf

   This output shows from/to branch columns and shows the highest
   percentage (from,to) jump combinations - i.e.  the most likely taken
   branches in the system.  "branches" can also include function calls
   and any other synchronous and asynchronous transitions of the
   instruction pointer that are not 'next instruction' - such as system
   calls, traps, interrupts, etc.

   This feature comes with (hopefully intuitive) flat ascii and TUI
   support in perf report.

 - Various 'perf annotate' visual improvements for us assembly junkies.
   It will now recognize function calls in the TUI and by hitting enter
   you can follow the call (recursively) and back, amongst other
   improvements.

 - Multiple threads/processes recording support in perf record, perf
   stat, perf top - which is activated via a comma-list of PIDs:

	perf top -p 21483,21485
	perf stat -p 21483,21485 -ddd
	perf record -p 21483,21485

 - Support for per UID views, via the --uid paramter to perf top, perf
   report, etc.  For example 'perf top --uid mingo' will only show the
   tasks that I am running, excluding other users, root, etc.

 - Jump label restructurings and improvements - this includes the
   factoring out of the (hopefully much clearer) include/linux/static_key.h
   generic facility:

	struct static_key key = STATIC_KEY_INIT_FALSE;

	...

	if (static_key_false(&key))
	        do unlikely code
	else
	        do likely code

	...
	static_key_slow_inc();
	...
	static_key_slow_inc();
	...

   The static_key_false() branch will be generated into the code with as
   little impact to the likely code path as possible.  the
   static_key_slow_*() APIs flip the branch via live kernel code patching.

   This facility can now be used more widely within the kernel to
   micro-optimize hot branches whose likelihood matches the static-key
   usage and fast/slow cost patterns.

 - SW function tracer improvements: perf support and filtering support.

 - Various hardenings of the perf.data ABI, to make older perf.data's
   smoother on newer tool versions, to make new features integrate more
   smoothly, to support cross-endian recording/analyzing workflows
   better, etc.

 - Restructuring of the kprobes code, the splitting out of 'optprobes',
   and a corner case bugfix.

 - Allow the tracing of kernel console output (printk).

 - Improvements/fixes to user-space RDPMC support, allowing user-space
   self-profiling code to extract PMU counts without performing any
   system calls, while playing nice with the kernel side.

 - 'perf bench' improvements

 - ... and lots of internal restructurings, cleanups and fixes that made
   these features possible.  And, as usual this list is incomplete as
   there were also lots of other improvements

* 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (120 commits)
  perf report: Fix annotate double quit issue in branch view mode
  perf report: Remove duplicate annotate choice in branch view mode
  perf/x86: Prettify pmu config literals
  perf report: Enable TUI in branch view mode
  perf report: Auto-detect branch stack sampling mode
  perf record: Add HEADER_BRANCH_STACK tag
  perf record: Provide default branch stack sampling mode option
  perf tools: Make perf able to read files from older ABIs
  perf tools: Fix ABI compatibility bug in print_event_desc()
  perf tools: Enable reading of perf.data files from different ABI rev
  perf: Add ABI reference sizes
  perf report: Add support for taken branch sampling
  perf record: Add support for sampling taken branch
  perf tools: Add code to support PERF_SAMPLE_BRANCH_STACK
  x86/kprobes: Split out optprobe related code to kprobes-opt.c
  x86/kprobes: Fix a bug which can modify kernel code permanently
  x86/kprobes: Fix instruction recovery on optimized path
  perf: Add callback to flush branch_stack on context switch
  perf: Disable PERF_SAMPLE_BRANCH_* when not supported
  perf/x86: Add LBR software filter support for Intel CPUs
  ...
This commit is contained in:
Linus Torvalds
2012-03-20 10:29:15 -07:00
165 changed files with 6110 additions and 1987 deletions
+51 -33
View File
@@ -1,3 +1,10 @@
OUTPUT := ./
ifeq ("$(origin O)", "command line")
ifneq ($(O),)
OUTPUT := $(O)/
endif
endif
MAN1_TXT= \
$(filter-out $(addsuffix .txt, $(ARTICLES) $(SP_ARTICLES)), \
$(wildcard perf-*.txt)) \
@@ -6,10 +13,11 @@ MAN5_TXT=
MAN7_TXT=
MAN_TXT = $(MAN1_TXT) $(MAN5_TXT) $(MAN7_TXT)
MAN_XML=$(patsubst %.txt,%.xml,$(MAN_TXT))
MAN_HTML=$(patsubst %.txt,%.html,$(MAN_TXT))
_MAN_XML=$(patsubst %.txt,%.xml,$(MAN_TXT))
_MAN_HTML=$(patsubst %.txt,%.html,$(MAN_TXT))
DOC_HTML=$(MAN_HTML)
MAN_XML=$(addprefix $(OUTPUT),$(_MAN_XML))
MAN_HTML=$(addprefix $(OUTPUT),$(_MAN_HTML))
ARTICLES =
# with their own formatting rules.
@@ -18,11 +26,17 @@ API_DOCS = $(patsubst %.txt,%,$(filter-out technical/api-index-skel.txt technica
SP_ARTICLES += $(API_DOCS)
SP_ARTICLES += technical/api-index
DOC_HTML += $(patsubst %,%.html,$(ARTICLES) $(SP_ARTICLES))
_DOC_HTML = $(_MAN_HTML)
_DOC_HTML+=$(patsubst %,%.html,$(ARTICLES) $(SP_ARTICLES))
DOC_HTML=$(addprefix $(OUTPUT),$(_DOC_HTML))
DOC_MAN1=$(patsubst %.txt,%.1,$(MAN1_TXT))
DOC_MAN5=$(patsubst %.txt,%.5,$(MAN5_TXT))
DOC_MAN7=$(patsubst %.txt,%.7,$(MAN7_TXT))
_DOC_MAN1=$(patsubst %.txt,%.1,$(MAN1_TXT))
_DOC_MAN5=$(patsubst %.txt,%.5,$(MAN5_TXT))
_DOC_MAN7=$(patsubst %.txt,%.7,$(MAN7_TXT))
DOC_MAN1=$(addprefix $(OUTPUT),$(_DOC_MAN1))
DOC_MAN5=$(addprefix $(OUTPUT),$(_DOC_MAN5))
DOC_MAN7=$(addprefix $(OUTPUT),$(_DOC_MAN7))
# Make the path relative to DESTDIR, not prefix
ifndef DESTDIR
@@ -150,9 +164,9 @@ man1: $(DOC_MAN1)
man5: $(DOC_MAN5)
man7: $(DOC_MAN7)
info: perf.info perfman.info
info: $(OUTPUT)perf.info $(OUTPUT)perfman.info
pdf: user-manual.pdf
pdf: $(OUTPUT)user-manual.pdf
install: install-man
@@ -166,7 +180,7 @@ install-man: man
install-info: info
$(INSTALL) -d -m 755 $(DESTDIR)$(infodir)
$(INSTALL) -m 644 perf.info perfman.info $(DESTDIR)$(infodir)
$(INSTALL) -m 644 $(OUTPUT)perf.info $(OUTPUT)perfman.info $(DESTDIR)$(infodir)
if test -r $(DESTDIR)$(infodir)/dir; then \
$(INSTALL_INFO) --info-dir=$(DESTDIR)$(infodir) perf.info ;\
$(INSTALL_INFO) --info-dir=$(DESTDIR)$(infodir) perfman.info ;\
@@ -176,7 +190,7 @@ install-info: info
install-pdf: pdf
$(INSTALL) -d -m 755 $(DESTDIR)$(pdfdir)
$(INSTALL) -m 644 user-manual.pdf $(DESTDIR)$(pdfdir)
$(INSTALL) -m 644 $(OUTPUT)user-manual.pdf $(DESTDIR)$(pdfdir)
#install-html: html
# '$(SHELL_PATH_SQ)' ./install-webdoc.sh $(DESTDIR)$(htmldir)
@@ -189,14 +203,14 @@ install-pdf: pdf
#
# Determine "include::" file references in asciidoc files.
#
doc.dep : $(wildcard *.txt) build-docdep.perl
$(OUTPUT)doc.dep : $(wildcard *.txt) build-docdep.perl
$(QUIET_GEN)$(RM) $@+ $@ && \
$(PERL_PATH) ./build-docdep.perl >$@+ $(QUIET_STDERR) && \
mv $@+ $@
-include doc.dep
-include $(OUPTUT)doc.dep
cmds_txt = cmds-ancillaryinterrogators.txt \
_cmds_txt = cmds-ancillaryinterrogators.txt \
cmds-ancillarymanipulators.txt \
cmds-mainporcelain.txt \
cmds-plumbinginterrogators.txt \
@@ -205,32 +219,36 @@ cmds_txt = cmds-ancillaryinterrogators.txt \
cmds-synchelpers.txt \
cmds-purehelpers.txt \
cmds-foreignscminterface.txt
cmds_txt=$(addprefix $(OUTPUT),$(_cmds_txt))
$(cmds_txt): cmd-list.made
$(cmds_txt): $(OUTPUT)cmd-list.made
cmd-list.made: cmd-list.perl ../command-list.txt $(MAN1_TXT)
$(OUTPUT)cmd-list.made: cmd-list.perl ../command-list.txt $(MAN1_TXT)
$(QUIET_GEN)$(RM) $@ && \
$(PERL_PATH) ./cmd-list.perl ../command-list.txt $(QUIET_STDERR) && \
date >$@
clean:
$(RM) *.xml *.xml+ *.html *.html+ *.1 *.5 *.7
$(RM) *.texi *.texi+ *.texi++ perf.info perfman.info
$(RM) howto-index.txt howto/*.html doc.dep
$(RM) technical/api-*.html technical/api-index.txt
$(RM) $(cmds_txt) *.made
$(RM) $(MAN_XML) $(addsuffix +,$(MAN_XML))
$(RM) $(MAN_HTML) $(addsuffix +,$(MAN_HTML))
$(RM) $(DOC_HTML) $(DOC_MAN1) $(DOC_MAN5) $(DOC_MAN7)
$(RM) $(OUTPUT)*.texi $(OUTPUT)*.texi+ $(OUTPUT)*.texi++
$(RM) $(OUTPUT)perf.info $(OUTPUT)perfman.info
$(RM) $(OUTPUT)howto-index.txt $(OUTPUT)howto/*.html $(OUTPUT)doc.dep
$(RM) $(OUTPUT)technical/api-*.html $(OUTPUT)technical/api-index.txt
$(RM) $(cmds_txt) $(OUTPUT)*.made
$(MAN_HTML): %.html : %.txt
$(MAN_HTML): $(OUTPUT)%.html : %.txt
$(QUIET_ASCIIDOC)$(RM) $@+ $@ && \
$(ASCIIDOC) -b xhtml11 -d manpage -f asciidoc.conf \
$(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) -o $@+ $< && \
mv $@+ $@
%.1 %.5 %.7 : %.xml
$(OUTPUT)%.1 $(OUTPUT)%.5 $(OUTPUT)%.7 : $(OUTPUT)%.xml
$(QUIET_XMLTO)$(RM) $@ && \
xmlto -m $(MANPAGE_XSL) $(XMLTO_EXTRA) man $<
xmlto -o $(OUTPUT) -m $(MANPAGE_XSL) $(XMLTO_EXTRA) man $<
%.xml : %.txt
$(OUTPUT)%.xml : %.txt
$(QUIET_ASCIIDOC)$(RM) $@+ $@ && \
$(ASCIIDOC) -b docbook -d manpage -f asciidoc.conf \
$(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) -o $@+ $< && \
@@ -239,25 +257,25 @@ $(MAN_HTML): %.html : %.txt
XSLT = docbook.xsl
XSLTOPTS = --xinclude --stringparam html.stylesheet docbook-xsl.css
user-manual.html: user-manual.xml
$(OUTPUT)user-manual.html: $(OUTPUT)user-manual.xml
$(QUIET_XSLTPROC)xsltproc $(XSLTOPTS) -o $@ $(XSLT) $<
perf.info: user-manual.texi
$(QUIET_MAKEINFO)$(MAKEINFO) --no-split -o $@ user-manual.texi
$(OUTPUT)perf.info: $(OUTPUT)user-manual.texi
$(QUIET_MAKEINFO)$(MAKEINFO) --no-split -o $@ $(OUTPUT)user-manual.texi
user-manual.texi: user-manual.xml
$(OUTPUT)user-manual.texi: $(OUTPUT)user-manual.xml
$(QUIET_DB2TEXI)$(RM) $@+ $@ && \
$(DOCBOOK2X_TEXI) user-manual.xml --encoding=UTF-8 --to-stdout >$@++ && \
$(DOCBOOK2X_TEXI) $(OUTPUT)user-manual.xml --encoding=UTF-8 --to-stdout >$@++ && \
$(PERL_PATH) fix-texi.perl <$@++ >$@+ && \
rm $@++ && \
mv $@+ $@
user-manual.pdf: user-manual.xml
$(OUTPUT)user-manual.pdf: $(OUTPUT)user-manual.xml
$(QUIET_DBLATEX)$(RM) $@+ $@ && \
$(DBLATEX) -o $@+ -p /etc/asciidoc/dblatex/asciidoc-dblatex.xsl -s /etc/asciidoc/dblatex/asciidoc-dblatex.sty $< && \
mv $@+ $@
perfman.texi: $(MAN_XML) cat-texi.perl
$(OUTPUT)perfman.texi: $(MAN_XML) cat-texi.perl
$(QUIET_DB2TEXI)$(RM) $@+ $@ && \
($(foreach xml,$(MAN_XML),$(DOCBOOK2X_TEXI) --encoding=UTF-8 \
--to-stdout $(xml) &&) true) > $@++ && \
@@ -265,7 +283,7 @@ perfman.texi: $(MAN_XML) cat-texi.perl
rm $@++ && \
mv $@+ $@
perfman.info: perfman.texi
$(OUTPUT)perfman.info: $(OUTPUT)perfman.texi
$(QUIET_MAKEINFO)$(MAKEINFO) --no-split --no-validate $*.texi
$(patsubst %.txt,%.texi,$(MAN_TXT)): %.texi : %.xml
+17 -3
View File
@@ -8,7 +8,7 @@ perf-lock - Analyze lock events
SYNOPSIS
--------
[verse]
'perf lock' {record|report|trace}
'perf lock' {record|report|script|info}
DESCRIPTION
-----------
@@ -20,10 +20,13 @@ and statistics with this 'perf lock' command.
produces the file "perf.data" which contains tracing
results of lock events.
'perf lock trace' shows raw lock events.
'perf lock report' reports statistical data.
'perf lock script' shows raw lock events.
'perf lock info' shows metadata like threads or addresses
of lock instances.
COMMON OPTIONS
--------------
@@ -47,6 +50,17 @@ REPORT OPTIONS
Sorting key. Possible values: acquired (default), contended,
wait_total, wait_max, wait_min.
INFO OPTIONS
------------
-t::
--threads::
dump thread list in perf.data
-m::
--map::
dump map of lock instances (address:name table)
SEE ALSO
--------
linkperf:perf[1]
+36 -2
View File
@@ -52,11 +52,15 @@ OPTIONS
-p::
--pid=::
Record events on existing process ID.
Record events on existing process ID (comma separated list).
-t::
--tid=::
Record events on existing thread ID.
Record events on existing thread ID (comma separated list).
-u::
--uid=::
Record events in threads owned by uid. Name or number.
-r::
--realtime=::
@@ -148,6 +152,36 @@ an empty cgroup (monitor all the time) using, e.g., -G foo,,bar. Cgroups must ha
corresponding events, i.e., they always refer to events defined earlier on the command
line.
-b::
--branch-any::
Enable taken branch stack sampling. Any type of taken branch may be sampled.
This is a shortcut for --branch-filter any. See --branch-filter for more infos.
-j::
--branch-filter::
Enable taken branch stack sampling. Each sample captures a series of consecutive
taken branches. The number of branches captured with each sample depends on the
underlying hardware, the type of branches of interest, and the executed code.
It is possible to select the types of branches captured by enabling filters. The
following filters are defined:
- any: any type of branches
- any_call: any function call or system call
- any_ret: any function return or system call return
- any_ind: any indirect branch
- u: only when the branch target is at the user level
- k: only when the branch target is in the kernel
- hv: only when the target is at the hypervisor level
+
The option requires at least one branch type among any, any_call, any_ret, ind_call.
The privilege levels may be ommitted, in which case, the privilege levels of the associated
event are applied to the branch filter. Both kernel (k) and hypervisor (hv) privilege
levels are subject to permissions. When sampling on multiple events, branch stack sampling
is enabled for all the sampling events. The sampled branch type is the same for all events.
The various filters must be specified as a comma separated list: --branch-filter any_ret,u,k
Note that this feature may not be available on all processors.
SEE ALSO
--------
linkperf:perf-stat[1], linkperf:perf-list[1]
+10
View File
@@ -153,6 +153,16 @@ OPTIONS
information which may be very large and thus may clutter the display.
It currently includes: cpu and numa topology of the host system.
-b::
--branch-stack::
Use the addresses of sampled taken branches instead of the instruction
address to build the histograms. To generate meaningful output, the
perf.data file must have been obtained using perf record -b or
perf record --branch-filter xxx where xxx is a branch filter option.
perf report is able to auto-detect whether a perf.data file contains
branch stacks and it will automatically switch to the branch view mode,
unless --no-branch-stack is used.
SEE ALSO
--------
linkperf:perf-stat[1], linkperf:perf-annotate[1]
+4 -1
View File
@@ -115,7 +115,7 @@ OPTIONS
-f::
--fields::
Comma separated list of fields to print. Options are:
comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr.
comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff.
Field list can be prepended with the type, trace, sw or hw,
to indicate to which event type the field list applies.
e.g., -f sw:comm,tid,time,ip,sym and -f trace:time,cpu,trace
@@ -200,6 +200,9 @@ OPTIONS
It currently includes: cpu and numa topology of the host system.
It can only be used with the perf script report mode.
--show-kernel-path::
Try to resolve the path of [kernel.kallsyms]
SEE ALSO
--------
linkperf:perf-record[1], linkperf:perf-script-perl[1],
+2 -2
View File
@@ -35,11 +35,11 @@ OPTIONS
child tasks do not inherit counters
-p::
--pid=<pid>::
stat events on existing process id
stat events on existing process id (comma separated list)
-t::
--tid=<tid>::
stat events on existing thread id
stat events on existing thread id (comma separated list)
-a::
+6 -2
View File
@@ -72,11 +72,15 @@ Default is to monitor all CPUS.
-p <pid>::
--pid=<pid>::
Profile events on existing Process ID.
Profile events on existing Process ID (comma separated list).
-t <tid>::
--tid=<tid>::
Profile events on existing thread ID.
Profile events on existing thread ID (comma separated list).
-u::
--uid=::
Record events in threads owned by uid. Name or number.
-r <priority>::
--realtime=<priority>::
+1
View File
@@ -9,6 +9,7 @@ lib/rbtree.c
include/linux/swab.h
arch/*/include/asm/unistd*.h
arch/*/lib/memcpy*.S
arch/*/lib/memset*.S
include/linux/poison.h
include/linux/magic.h
include/linux/hw_breakpoint.h
+23 -3
View File
@@ -15,6 +15,16 @@ endif
# Define V to have a more verbose compile.
#
# Define O to save output files in a separate directory.
#
# Define ARCH as name of target architecture if you want cross-builds.
#
# Define CROSS_COMPILE as prefix name of compiler if you want cross-builds.
#
# Define NO_LIBPERL to disable perl script extension.
#
# Define NO_LIBPYTHON to disable python script extension.
#
# Define PYTHON to point to the python binary if the default
# `python' is not correct; for example: PYTHON=python2
#
@@ -32,6 +42,10 @@ endif
# Define NO_DWARF if you do not want debug-info analysis feature at all.
#
# Define WERROR=0 to disable treating any warnings as errors.
#
# Define NO_NEWT if you do not want TUI support.
#
# Define NO_DEMANGLE if you do not want C++ symbol demangling.
$(OUTPUT)PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
@$(SHELL_PATH) util/PERF-VERSION-GEN $(OUTPUT)
@@ -61,7 +75,7 @@ ifeq ($(ARCH),x86_64)
ifeq (${IS_X86_64}, 1)
RAW_ARCH := x86_64
ARCH_CFLAGS := -DARCH_X86_64
ARCH_INCLUDE = ../../arch/x86/lib/memcpy_64.S
ARCH_INCLUDE = ../../arch/x86/lib/memcpy_64.S ../../arch/x86/lib/memset_64.S
endif
endif
@@ -183,7 +197,10 @@ SCRIPT_SH += perf-archive.sh
grep-libs = $(filter -l%,$(1))
strip-libs = $(filter-out -l%,$(1))
$(OUTPUT)python/perf.so: $(PYRF_OBJS)
PYTHON_EXT_SRCS := $(shell grep -v ^\# util/python-ext-sources)
PYTHON_EXT_DEPS := util/python-ext-sources util/setup.py
$(OUTPUT)python/perf.so: $(PYRF_OBJS) $(PYTHON_EXT_SRCS) $(PYTHON_EXT_DEPS)
$(QUIET_GEN)CFLAGS='$(BASIC_CFLAGS)' $(PYTHON_WORD) util/setup.py \
--quiet build_ext; \
mkdir -p $(OUTPUT)python && \
@@ -258,6 +275,7 @@ LIB_H += util/callchain.h
LIB_H += util/build-id.h
LIB_H += util/debug.h
LIB_H += util/debugfs.h
LIB_H += util/sysfs.h
LIB_H += util/event.h
LIB_H += util/evsel.h
LIB_H += util/evlist.h
@@ -304,6 +322,7 @@ LIB_OBJS += $(OUTPUT)util/build-id.o
LIB_OBJS += $(OUTPUT)util/config.o
LIB_OBJS += $(OUTPUT)util/ctype.o
LIB_OBJS += $(OUTPUT)util/debugfs.o
LIB_OBJS += $(OUTPUT)util/sysfs.o
LIB_OBJS += $(OUTPUT)util/environment.o
LIB_OBJS += $(OUTPUT)util/event.o
LIB_OBJS += $(OUTPUT)util/evlist.o
@@ -361,8 +380,10 @@ BUILTIN_OBJS += $(OUTPUT)bench/sched-messaging.o
BUILTIN_OBJS += $(OUTPUT)bench/sched-pipe.o
ifeq ($(RAW_ARCH),x86_64)
BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy-x86-64-asm.o
BUILTIN_OBJS += $(OUTPUT)bench/mem-memset-x86-64-asm.o
endif
BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy.o
BUILTIN_OBJS += $(OUTPUT)bench/mem-memset.o
BUILTIN_OBJS += $(OUTPUT)builtin-diff.o
BUILTIN_OBJS += $(OUTPUT)builtin-evlist.o
@@ -794,7 +815,6 @@ help:
@echo ' quick-install-html - install the html documentation quickly'
@echo ''
@echo 'Perf maintainer targets:'
@echo ' distclean - alias to clean'
@echo ' clean - clean all binary objects and build output'
doc:
+1
View File
@@ -4,6 +4,7 @@
extern int bench_sched_messaging(int argc, const char **argv, const char *prefix);
extern int bench_sched_pipe(int argc, const char **argv, const char *prefix);
extern int bench_mem_memcpy(int argc, const char **argv, const char *prefix __used);
extern int bench_mem_memset(int argc, const char **argv, const char *prefix);
#define BENCH_FORMAT_DEFAULT_STR "default"
#define BENCH_FORMAT_DEFAULT 0
@@ -2,3 +2,11 @@
MEMCPY_FN(__memcpy,
"x86-64-unrolled",
"unrolled memcpy() in arch/x86/lib/memcpy_64.S")
MEMCPY_FN(memcpy_c,
"x86-64-movsq",
"movsq-based memcpy() in arch/x86/lib/memcpy_64.S")
MEMCPY_FN(memcpy_c_e,
"x86-64-movsb",
"movsb-based memcpy() in arch/x86/lib/memcpy_64.S")
+5 -1
View File
@@ -1,4 +1,8 @@
#define memcpy MEMCPY /* don't hide glibc's memcpy() */
#define altinstr_replacement text
#define globl p2align 4; .globl
#define Lmemcpy_c globl memcpy_c; memcpy_c
#define Lmemcpy_c_e globl memcpy_c_e; memcpy_c_e
#include "../../../arch/x86/lib/memcpy_64.S"
/*
* We need to provide note.GNU-stack section, saying that we want
+9 -3
View File
@@ -5,7 +5,6 @@
*
* Written by Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
*/
#include <ctype.h>
#include "../perf.h"
#include "../util/util.h"
@@ -24,6 +23,7 @@
static const char *length_str = "1MB";
static const char *routine = "default";
static int iterations = 1;
static bool use_clock;
static int clock_fd;
static bool only_prefault;
@@ -35,6 +35,8 @@ static const struct option options[] = {
"available unit: B, MB, GB (upper and lower)"),
OPT_STRING('r', "routine", &routine, "default",
"Specify routine to copy"),
OPT_INTEGER('i', "iterations", &iterations,
"repeat memcpy() invocation this number of times"),
OPT_BOOLEAN('c', "clock", &use_clock,
"Use CPU clock for measuring"),
OPT_BOOLEAN('o', "only-prefault", &only_prefault,
@@ -121,6 +123,7 @@ static u64 do_memcpy_clock(memcpy_t fn, size_t len, bool prefault)
{
u64 clock_start = 0ULL, clock_end = 0ULL;
void *src = NULL, *dst = NULL;
int i;
alloc_mem(&src, &dst, len);
@@ -128,7 +131,8 @@ static u64 do_memcpy_clock(memcpy_t fn, size_t len, bool prefault)
fn(dst, src, len);
clock_start = get_clock();
fn(dst, src, len);
for (i = 0; i < iterations; ++i)
fn(dst, src, len);
clock_end = get_clock();
free(src);
@@ -140,6 +144,7 @@ static double do_memcpy_gettimeofday(memcpy_t fn, size_t len, bool prefault)
{
struct timeval tv_start, tv_end, tv_diff;
void *src = NULL, *dst = NULL;
int i;
alloc_mem(&src, &dst, len);
@@ -147,7 +152,8 @@ static double do_memcpy_gettimeofday(memcpy_t fn, size_t len, bool prefault)
fn(dst, src, len);
BUG_ON(gettimeofday(&tv_start, NULL));
fn(dst, src, len);
for (i = 0; i < iterations; ++i)
fn(dst, src, len);
BUG_ON(gettimeofday(&tv_end, NULL));
timersub(&tv_end, &tv_start, &tv_diff);
+12
View File
@@ -0,0 +1,12 @@
#ifdef ARCH_X86_64
#define MEMSET_FN(fn, name, desc) \
extern void *fn(void *, int, size_t);
#include "mem-memset-x86-64-asm-def.h"
#undef MEMSET_FN
#endif
@@ -0,0 +1,12 @@
MEMSET_FN(__memset,
"x86-64-unrolled",
"unrolled memset() in arch/x86/lib/memset_64.S")
MEMSET_FN(memset_c,
"x86-64-stosq",
"movsq-based memset() in arch/x86/lib/memset_64.S")
MEMSET_FN(memset_c_e,
"x86-64-stosb",
"movsb-based memset() in arch/x86/lib/memset_64.S")
+13
View File
@@ -0,0 +1,13 @@
#define memset MEMSET /* don't hide glibc's memset() */
#define altinstr_replacement text
#define globl p2align 4; .globl
#define Lmemset_c globl memset_c; memset_c
#define Lmemset_c_e globl memset_c_e; memset_c_e
#include "../../../arch/x86/lib/memset_64.S"
/*
* We need to provide note.GNU-stack section, saying that we want
* NOT executable stack. Otherwise the final linking will assume that
* the ELF stack should not be restricted at all and set it RWX.
*/
.section .note.GNU-stack,"",@progbits
+297
View File
@@ -0,0 +1,297 @@
/*
* mem-memset.c
*
* memset: Simple memory set in various ways
*
* Trivial clone of mem-memcpy.c.
*/
#include "../perf.h"
#include "../util/util.h"
#include "../util/parse-options.h"
#include "../util/header.h"
#include "bench.h"
#include "mem-memset-arch.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <errno.h>
#define K 1024
static const char *length_str = "1MB";
static const char *routine = "default";
static int iterations = 1;
static bool use_clock;
static int clock_fd;
static bool only_prefault;
static bool no_prefault;
static const struct option options[] = {
OPT_STRING('l', "length", &length_str, "1MB",
"Specify length of memory to copy. "
"available unit: B, MB, GB (upper and lower)"),
OPT_STRING('r', "routine", &routine, "default",
"Specify routine to copy"),
OPT_INTEGER('i', "iterations", &iterations,
"repeat memset() invocation this number of times"),
OPT_BOOLEAN('c', "clock", &use_clock,
"Use CPU clock for measuring"),
OPT_BOOLEAN('o', "only-prefault", &only_prefault,
"Show only the result with page faults before memset()"),
OPT_BOOLEAN('n', "no-prefault", &no_prefault,
"Show only the result without page faults before memset()"),
OPT_END()
};
typedef void *(*memset_t)(void *, int, size_t);
struct routine {
const char *name;
const char *desc;
memset_t fn;
};
static const struct routine routines[] = {
{ "default",
"Default memset() provided by glibc",
memset },
#ifdef ARCH_X86_64
#define MEMSET_FN(fn, name, desc) { name, desc, fn },
#include "mem-memset-x86-64-asm-def.h"
#undef MEMSET_FN
#endif
{ NULL,
NULL,
NULL }
};
static const char * const bench_mem_memset_usage[] = {
"perf bench mem memset <options>",
NULL
};
static struct perf_event_attr clock_attr = {
.type = PERF_TYPE_HARDWARE,
.config = PERF_COUNT_HW_CPU_CYCLES
};
static void init_clock(void)
{
clock_fd = sys_perf_event_open(&clock_attr, getpid(), -1, -1, 0);
if (clock_fd < 0 && errno == ENOSYS)
die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
else
BUG_ON(clock_fd < 0);
}
static u64 get_clock(void)
{
int ret;
u64 clk;
ret = read(clock_fd, &clk, sizeof(u64));
BUG_ON(ret != sizeof(u64));
return clk;
}
static double timeval2double(struct timeval *ts)
{
return (double)ts->tv_sec +
(double)ts->tv_usec / (double)1000000;
}
static void alloc_mem(void **dst, size_t length)
{
*dst = zalloc(length);
if (!dst)
die("memory allocation failed - maybe length is too large?\n");
}
static u64 do_memset_clock(memset_t fn, size_t len, bool prefault)
{
u64 clock_start = 0ULL, clock_end = 0ULL;
void *dst = NULL;
int i;
alloc_mem(&dst, len);
if (prefault)
fn(dst, -1, len);
clock_start = get_clock();
for (i = 0; i < iterations; ++i)
fn(dst, i, len);
clock_end = get_clock();
free(dst);
return clock_end - clock_start;
}
static double do_memset_gettimeofday(memset_t fn, size_t len, bool prefault)
{
struct timeval tv_start, tv_end, tv_diff;
void *dst = NULL;
int i;
alloc_mem(&dst, len);
if (prefault)
fn(dst, -1, len);
BUG_ON(gettimeofday(&tv_start, NULL));
for (i = 0; i < iterations; ++i)
fn(dst, i, len);
BUG_ON(gettimeofday(&tv_end, NULL));
timersub(&tv_end, &tv_start, &tv_diff);
free(dst);
return (double)((double)len / timeval2double(&tv_diff));
}
#define pf (no_prefault ? 0 : 1)
#define print_bps(x) do { \
if (x < K) \
printf(" %14lf B/Sec", x); \
else if (x < K * K) \
printf(" %14lfd KB/Sec", x / K); \
else if (x < K * K * K) \
printf(" %14lf MB/Sec", x / K / K); \
else \
printf(" %14lf GB/Sec", x / K / K / K); \
} while (0)
int bench_mem_memset(int argc, const char **argv,
const char *prefix __used)
{
int i;
size_t len;
double result_bps[2];
u64 result_clock[2];
argc = parse_options(argc, argv, options,
bench_mem_memset_usage, 0);
if (use_clock)
init_clock();
len = (size_t)perf_atoll((char *)length_str);
result_clock[0] = result_clock[1] = 0ULL;
result_bps[0] = result_bps[1] = 0.0;
if ((s64)len <= 0) {
fprintf(stderr, "Invalid length:%s\n", length_str);
return 1;
}
/* same to without specifying either of prefault and no-prefault */
if (only_prefault && no_prefault)
only_prefault = no_prefault = false;
for (i = 0; routines[i].name; i++) {
if (!strcmp(routines[i].name, routine))
break;
}
if (!routines[i].name) {
printf("Unknown routine:%s\n", routine);
printf("Available routines...\n");
for (i = 0; routines[i].name; i++) {
printf("\t%s ... %s\n",
routines[i].name, routines[i].desc);
}
return 1;
}
if (bench_format == BENCH_FORMAT_DEFAULT)
printf("# Copying %s Bytes ...\n\n", length_str);
if (!only_prefault && !no_prefault) {
/* show both of results */
if (use_clock) {
result_clock[0] =
do_memset_clock(routines[i].fn, len, false);
result_clock[1] =
do_memset_clock(routines[i].fn, len, true);
} else {
result_bps[0] =
do_memset_gettimeofday(routines[i].fn,
len, false);
result_bps[1] =
do_memset_gettimeofday(routines[i].fn,
len, true);
}
} else {
if (use_clock) {
result_clock[pf] =
do_memset_clock(routines[i].fn,
len, only_prefault);
} else {
result_bps[pf] =
do_memset_gettimeofday(routines[i].fn,
len, only_prefault);
}
}
switch (bench_format) {
case BENCH_FORMAT_DEFAULT:
if (!only_prefault && !no_prefault) {
if (use_clock) {
printf(" %14lf Clock/Byte\n",
(double)result_clock[0]
/ (double)len);
printf(" %14lf Clock/Byte (with prefault)\n ",
(double)result_clock[1]
/ (double)len);
} else {
print_bps(result_bps[0]);
printf("\n");
print_bps(result_bps[1]);
printf(" (with prefault)\n");
}
} else {
if (use_clock) {
printf(" %14lf Clock/Byte",
(double)result_clock[pf]
/ (double)len);
} else
print_bps(result_bps[pf]);
printf("%s\n", only_prefault ? " (with prefault)" : "");
}
break;
case BENCH_FORMAT_SIMPLE:
if (!only_prefault && !no_prefault) {
if (use_clock) {
printf("%lf %lf\n",
(double)result_clock[0] / (double)len,
(double)result_clock[1] / (double)len);
} else {
printf("%lf %lf\n",
result_bps[0], result_bps[1]);
}
} else {
if (use_clock) {
printf("%lf\n", (double)result_clock[pf]
/ (double)len);
} else
printf("%lf\n", result_bps[pf]);
}
break;
default:
/* reaching this means there's some disaster: */
die("unknown format: %d\n", bench_format);
break;
}
return 0;
}
+3
View File
@@ -52,6 +52,9 @@ static struct bench_suite mem_suites[] = {
{ "memcpy",
"Simple memory copy in various ways",
bench_mem_memcpy },
{ "memset",
"Simple memory set in various ways",
bench_mem_memset },
suite_all,
{ NULL,
NULL,
+2 -2
View File
@@ -922,12 +922,12 @@ static const struct option info_options[] = {
OPT_BOOLEAN('t', "threads", &info_threads,
"dump thread list in perf.data"),
OPT_BOOLEAN('m', "map", &info_map,
"map of lock instances (name:address table)"),
"map of lock instances (address:name table)"),
OPT_END()
};
static const char * const lock_usage[] = {
"perf lock [<options>] {record|trace|report}",
"perf lock [<options>] {record|report|script|info}",
NULL
};
+6 -6
View File
@@ -58,7 +58,7 @@ static struct {
struct perf_probe_event events[MAX_PROBES];
struct strlist *dellist;
struct line_range line_range;
const char *target_module;
const char *target;
int max_probe_points;
struct strfilter *filter;
} params;
@@ -246,7 +246,7 @@ static const struct option options[] = {
"file", "vmlinux pathname"),
OPT_STRING('s', "source", &symbol_conf.source_prefix,
"directory", "path to kernel source"),
OPT_STRING('m', "module", &params.target_module,
OPT_STRING('m', "module", &params.target,
"modname|path",
"target module name (for online) or path (for offline)"),
#endif
@@ -333,7 +333,7 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used)
if (!params.filter)
params.filter = strfilter__new(DEFAULT_FUNC_FILTER,
NULL);
ret = show_available_funcs(params.target_module,
ret = show_available_funcs(params.target,
params.filter);
strfilter__delete(params.filter);
if (ret < 0)
@@ -354,7 +354,7 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used)
usage_with_options(probe_usage, options);
}
ret = show_line_range(&params.line_range, params.target_module);
ret = show_line_range(&params.line_range, params.target);
if (ret < 0)
pr_err(" Error: Failed to show lines. (%d)\n", ret);
return ret;
@@ -371,7 +371,7 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used)
ret = show_available_vars(params.events, params.nevents,
params.max_probe_points,
params.target_module,
params.target,
params.filter,
params.show_ext_vars);
strfilter__delete(params.filter);
@@ -393,7 +393,7 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used)
if (params.nevents) {
ret = add_perf_probe_events(params.events, params.nevents,
params.max_probe_points,
params.target_module,
params.target,
params.force_add);
if (ret < 0) {
pr_err(" Error: Failed to add events. (%d)\n", ret);

Some files were not shown because too many files have changed in this diff Show More