Imported Upstream version 6.10.0.49

Former-commit-id: 1d6753294b2993e1fbf92de9366bb9544db4189b
This commit is contained in:
Xamarin Public Jenkins (auto-signing)
2020-01-16 16:38:04 +00:00
parent d94e79959b
commit 468663ddbb
48518 changed files with 2789335 additions and 61176 deletions

View File

@@ -0,0 +1 @@
AddHandler server-parsed .html

View File

@@ -0,0 +1,36 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<META http-equiv="Content-Type" content="text/html; charset=ISO-8859-1" />
<title>Polly - Bugs</title>
<link type="text/css" rel="stylesheet" href="menu.css" />
<link type="text/css" rel="stylesheet" href="content.css" />
</head>
<body>
<div id="box">
<!--#include virtual="menu.html.incl"-->
<div id="content">
<h1>Bug Reports</h1>
Polly uses the LLVM bug tracking system:
<ul>
<li>
<a href="https://bugs.llvm.org/enter_bug.cgi?product=Polly">File new bug</a>
</li>
<li>
<a
href="https://bugs.llvm.org/buglist.cgi?query_format=advanced&bug_status=NEW&bug_status=ASSIGNED&bug_status=REOPENED&product=Polly&list_id=91437">Show open bugs</a>
</li>
</ul>
</div>
</div>
</body>
</html>

View File

@@ -0,0 +1,59 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<!-- Material used from: HTML 4.01 specs: http://www.w3.org/TR/html401/ -->
<html>
<head> <META http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
<title>Polly - ChangeLog</title>
<link type="text/css" rel="stylesheet" href="menu.css">
<link type="text/css" rel="stylesheet" href="content.css">
</head>
<body>
<div id="box">
<!--#include virtual="menu.html.incl"-->
<div id="content">
<h1> ChangeLog </h1>
<h2> trunk</h2>
<ul>
<li>Optimized isl for small integers, such that mostly cheap 32bit operations
are used instead of costly arbitrary precision integers that often also involve
malloc/free calls. As a result, the compile-time increase due to Polly has
been largely reduced.</li>
<li>Support for modulo operations: Accesses such as <pre>A[t%2][i]</pre> can
now be analyzed.</li>
</ul>
<h2> 3.7 </h2>
<ul>
<li>libPluto support has been removed. It has not been tested regularly and
due to its copyleft license it never had a chance to become a core
piece of Polly. Experiments with different schedulers should use the jscop
interface.</li>
</ul>
<h2> 3.6</h2>
<ul>
<li>Switch to the new isl AST generator (replacing CLooG)</li>
<li>Run-time alias checks</li>
<li>Computation of no-alias information for later LLVM optimizations
(vectorizer, LICM, ...)</li>
<li>Support for multi-dimensional arrays of parametric size (still tested)</li>
<li>New assumption tracking framework
<ul>
<li>Accesses to multi-dimensional arrays of fixed size are within bounds</li>
</ul>
</li>
<li>Compile-time reduction</li>
</ul>
<h2> Older releases</h2>
No changelog available. Please look at the <a
href="http://repo.or.cz/w/polly-mirror.git">commit history</a>.
</div>
</div>
</body>
</html>

View File

@@ -0,0 +1,139 @@
/* Page-wide defaults. */
html { margin: 0px; } body { margin: 8px; }
html, body {
  padding: 0px;
  font-family: "Lucida Grande", "Lucida Sans Unicode", Arial, Verdana, Helvetica, sans-serif;
  background-color: #fff;
  color: #222;
}

/* Centered page container. */
#box {
  margin-left: auto;
  margin-right: auto;
  max-width: 67em;
}

/* Main content column, placed to the right of the menu. */
[id=content] {
  /* ***** EDIT THIS VALUE IF CONTENT OVERLAPS MENU ***** */
  margin-left: 21em;
  padding-left: 3em;
}

a:visited {
  color: #931e24;
}

/* Headings and lists. */
h1, h2, h3, tt { color: #000 }
h1 { padding-top:0px; margin-top:0px;}
h2 { color:#333333; padding-top:0.5em; }
h3 { padding-top: 0.5em; color:#2d58b7}
li { padding-bottom: 0.5em; }
ul { padding-left:1.5em; }

/* Status cells used in TODO/status tables. */
TD.done {background-color: #88ff99; text-align: center}
TD.inprogress{background-color: #ffce00; text-align: center}
TD.open{background-color: #e6705f; text-align: center}
TD.nice{background-color: #5555df; text-align: center}
TD.niceinprogress{background-color: #8888ff; text-align: center}

PRE.code {padding-left: 0.5em; background-color: #eeeeee}
PRE {padding-left: 0.5em}

/* Slides */
IMG.img_slide {
  display: block;
  margin-left: auto;
  margin-right: auto
}

.itemTitle { color:#2d58b7 }
span.error { color:red }
span.caret { color:green; font-weight:bold }

/* Tables */
tr { vertical-align:top }

#news P {padding: 0px; margin: 0px; border: 0px}

/* Page header banner. */
#head {
  min-height: 15em;
  background-image:url(images/header-background.png);
  background-repeat:no-repeat;
  background-position: right;
  max-width: 70em;
  margin-bottom: 1em
}
#head h1 {
  padding-bottom: 0em;
  margin-bottom: 0em;
  padding-top: 1em;
  padding-left: 2em;
  font-size: 3em;
}
/* White rounded "pill" behind the header title. */
#head h1 span {
  background: white;
  -webkit-border-radius: 5px;
  -moz-border-radius: 5px;
  border-radius: 5px;
  box-shadow: 1px 2px 5px 1px rgba(0, 0, 0, 0.7),
              -1px 2px 20px rgba(255, 255, 255, 0.6) inset;
}
#head h1 span a {
  text-decoration: none;
  color: #3b4567;
}
/* Non-breaking spaces pad the pill on both sides. */
#head h1 span:before {
  content: "\00a0 ";
}
#head h1 span:after {
  content: "\00a0 ";
}
#head h2 {
  color: #3b4567;
  text-align: center;
  padding-top: 0em;
  margin-top: 0em;
  padding-left: .5em;
  padding-bottom: .5em;
}
/* White rounded "pill" behind the header subtitle. */
#head h2 span {
  background: white;
  -webkit-border-radius: 5px;
  -moz-border-radius: 5px;
  border-radius: 5px;
  box-shadow: 1px 2px 5px 1px rgba(0, 0, 0, 0.7),
              -1px 2px 20px rgba(255, 255, 255, 0.6) inset;
  padding-top: 0.2em;
  padding-bottom: 0.2em;
}
#head h2 span:before {
  content: "\00a0\00a0\00a0";
}
#head h2 span:after {
  content: "\00a0\00a0\00a0";
}
#head p:before {
  content: "\00a0\00a0\00a0";
}
#head p:after {
  content: "\00a0\00a0\00a0";
}
#head p {
  padding:0.1em;
  background: rgba(0,0,0,0.3);
  color: white;
  position: absolute;
  top: -1em; right: 0.5em;
  -webkit-border-radius: 5px;
  -moz-border-radius: 5px;
  /* Standard property, matching the sibling #head span rules. */
  border-radius: 5px;
}

View File

@@ -0,0 +1,65 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<META http-equiv="Content-Type" content="text/html; charset=ISO-8859-1" />
<title>Polly - Contributors</title>
<link type="text/css" rel="stylesheet" href="menu.css" />
<link type="text/css" rel="stylesheet" href="content.css" />
</head>
<body>
<div id="box">
<!--#include virtual="menu.html.incl"-->
<div id="content">
<h1>Contributors</h1>
Polly is developed by a team of students supported by different universities.
<h2>People</h2>
<h3>Raghesh Aloor</h3>
<p>Raghesh works on OpenMP code generation. He is funded as Google Summer of Code
Student 2011.</p>
<h3>Johannes Doerfert</h3>
<p>Johannes works on Polly as part of his PhD research at Saarland University
in Germany.</p>
<h3>Tobias Grosser</h3>
<p>Tobias is one of the two Co-founders of Polly. He designed the overall
architecture and contributed to almost every part of Polly. Polly was started
during his diploma studies at University of Passau. Furthermore, he spent 6
months at Ohio State University (funded by the U.S. National Science Foundation
through awards 0811781 and 0926688). From August 2011 he works on Polly,
during his PhD with INRIA/UMPC/ENS (funded through a
<a href="http://research.google.com/university/relations/fellowship_recipients.html">
Google Europe Fellowship in Efficient Computing</a>).</p>
<p>Website: <a href="http://www.grosser.es">www.grosser.es</a></p>
<h3>Andreas Simb&uuml;rger</h3>
<p>
Andreas works on the profiling infrastructure during his PhD at University of
Passau.
</p>
<p>Website: <a href="http://www.infosun.fim.uni-passau.de/cl/staff/simbuerger/">
http://www.infosun.fim.uni-passau.de/cl/staff/simbuerger/</a></p>
<h3>Hongbin Zheng</h3>
<p>Hongbin Zheng is one of the two Co-founders of Polly. He was funded as a
Google Summer of Code Student 2010 and implemented parts of the Polly frontends
as well as the automake/cmake infrastructure.</p>
<h2> Universities</h2>
<p>Polly is supported by the following Universities.</p>
<img src="images/iit-madras.png" style="padding:1em" />
<img src="images/uni-passau.png" style="padding: 1em; padding-bottom:2em;"/>
<img src="images/osu.png" style="padding:1em"/>
<img src="images/sys-uni.png" style="padding:1em"/>
</div>
</div>
</body>
</html>

View File

@@ -0,0 +1,37 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<!-- Material used from: HTML 4.01 specs: http://www.w3.org/TR/html401/ -->
<html>
<head>
<META http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
<title>Polly - Documentation</title>
<link type="text/css" rel="stylesheet" href="menu.css">
<link type="text/css" rel="stylesheet" href="content.css">
</head>
<body>
<div id="box">
<!--#include virtual="menu.html.incl"-->
<div id="content">
<!--*********************************************************************-->
<h1>Documentation</h1>
<!--*********************************************************************-->
<ul>
<li><a href="documentation/architecture.html">The Architecture of Polly</a></li>
<li><a href="example_load_Polly_into_clang.html">Use Polly in clang/clang++</a>
</li>
<li>
<a href="example_manual_matmul.html">Inside Polly - How to manually use the
individual pieces of Polly</a>
</li>
<li><a href="documentation/passes.html">A list of the LLVM passes available
in Polly</a></li>
<li><a href="docs/">New Sphinx based documentation (early stage)</a></li>
</ul>
</div>
</div>
</body>
</html>

View File

@@ -0,0 +1,23 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<!-- Material used from: HTML 4.01 specs: http://www.w3.org/TR/html401/ -->
<html>
<head>
<META http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
<title>Polly - The architecture</title>
<link type="text/css" rel="stylesheet" href="../menu.css">
<link type="text/css" rel="stylesheet" href="../content.css">
</head>
<body>
<div id="box">
<!--#include virtual="../menu.html.incl"-->
<div id="content">
<!--*********************************************************************-->
<h1>The Architecture Diagram of Polly</h1>
<!--*********************************************************************-->
<img src='../images/architecture.png' />
</div>
</div>
</body>
</html>

View File

@@ -0,0 +1,229 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<!-- Material used from: HTML 4.01 specs: http://www.w3.org/TR/html401/ -->
<html>
<head>
<META http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
<title>Polly - GPGPU Code Generation</title>
<link type="text/css" rel="stylesheet" href="../menu.css">
<link type="text/css" rel="stylesheet" href="../content.css">
</head>
<body>
<div id="box">
<!--#include virtual="../menu.html.incl"-->
<div id="content">
<!--*********************************************************************-->
<h1>Polly - GPGPU Code Generation</h1>
<!--*********************************************************************-->
<p><em>WARNING: This project was part of the Google Summer of Code 2012.
It is currently not finished, but it is in the design and implementation stage.
The ideas/plans described here may not yet be implemented in Polly and may
change later on.</em></p>
This project adds GPGPU code generation feature to Polly.
<h2>Objective</h2>
<p>The overall objective of this GSoC project is to create a preliminary
implementation of GPGPU code generation for Polly. With this addition, users
can parallelize some perfectly nested loops with Polly to execute on a
heterogeneous platform, composed of CPU and GPU.</p>
<p>There are several successful projects about automatic source-to-source gpu
code transformation. C-to-CUDA[1] uses the standard Pluto algorithms for
computing an affine schedule and then applies a wavefront transformation to
obtain one sequential and n-1 parallel loops. The parallel loops are then
mapped onto the blocks and threads of GPU. PPCG[2] introduces some advanced
algorithms which can expose much more parallelism than other methods. It
also introduces affine partition heuristics and code generation algorithms
for locality enhancement in the registers and shared memory.</p>
<p>Since automatic GPGPU code generation is quite a complex problem and what we
target is a low-level intermediate representation, LLVM IR, rather than a
high-level language source, it is important for us to set a proper objective
as a start step to give a complete solution to GPGPU code generation for LLVM
IR.</p>
<p>Firstly, we plan to target two kinds of relatively simple test cases. One is
comprised of pure parallel and perfectly nested loops, like the following
code.</p>
<pre>
parfor(int i=0 to M)
parfor(int j=0 to N)
LoopBody(i, j);
</pre>
<p>Another one is that all the loops in it are parallel except the inner-most
one, just like this:</p>
<pre>
parfor(int i=0 to M)
parfor(int j=0 to N)
non-parfor(int k=0 to K)
LoopBody(i, j, k);
</pre>
<p>The LoopBody part should be limited to instructions or functions calls
(intrinsics) which can be handled by LLVM's NVPTX backend.</p>
<p>On the other hand, we focus on building a preliminary and scalable framework
of GPGPU code generation for polly. Thus we plan to employ relatively simple
tiling and mapping algorithms and optimize them later.</p>
<h2>Work Flow</h2>
<h3>GPGPU Code Generation In General</h3>
<p>C-to-CUDA[1] and PPCG[2] propose similar steps to solve the automatic GPGPU
code generation problem.</p>
<ol>
<li>Look for parallel loops.</li>
<li>Create a polyhedral model from the loops.</li>
<li>Tile and map the loops to GPU blocks and threads.</li>
<li>Determine where to place the data.</li>
</ol>
<h3>What has been done in Polly</h3>
<p>Polly has implemented the 1st, 2nd and part of the 3rd of the above steps and
many other analysis and transformation passes.</p>
<h3>What to do in Polly</h3>
<p>Unlike many source-to-source optimizers such as C-to-CUDA and PPCG, Polly is
a low-level optimizer, which means we can't use a source-level compiler
(e.g. NVCC) to generate the final assembly for the device. We need to manually
insert device driver API calls to execute the generated kernel assembly
text.</p>
<p>In this project, we assume that the device driver library has provided an
interface to launch kernels in the form of assembly text. Fortunately, most
of the mainstream GPU vendors provide such a feature in their products (see
ptxjit of NVIDIA GPUs and CAL of AMD GPUs). Generally speaking, what we
are going to do in Polly is:</p>
<ul>
<li>Find a way to tile the parallel loops.</li>
<li>Find a way to extract the loop body and transform it into thread-centric
parallel code.</li>
<li>Find a way to store/load the thread-centric code into/from a device module.</li>
<li>Find a way to pass the target machine information and generate code of the
device module for the target.</li>
<li>Find a way to map the tiled loop to GPU blocks and threads.</li>
<li>Find a way to insert CUDA synchronization operations on-demand.</li>
<li>Find a way to generate the memory copy operations between a host and a
device.</li>
<li>Implement/Wrap a runtime library to serve as the execution engine for the
generated device code.</li>
</ul>
<h3>The Work Flow</h3>
<p>In this section, we assume that the host cpu is X86 and the device is NVIDIA
CUDA-compatible. We will use the following test case to describe our work
flow.</p>
<pre>
for(i = 0; i &lt; 128; i++)
for(j = 0; j &lt; 128; j++)
A[i][j] = i*128 + j;
</pre>
<p>The work flow of our code generator is as follows.</p>
<p>1.We first use Polly's jscop file importer to get a wanted 4-level parallel
tiled code.</p>
The "schedule" part of the pre-optimization jscop file is as the following:
<pre>
"schedule" : "{ Stmt_for_body3[i0, i1] -&gt; schedule[0, i0, 0, i1, 0] }"
</pre>
The jscop file describing the tiling transformation is:
<pre>
"schedule" : "{ Stmt_for_body3[i0, i1] -&gt; schedule[0, o0, o1, o2, o3]:
o0 &gt;= 0 and o0 &lt;= 7 and o1 &gt;= 0 and o1 &lt;= 15 and
o2 &gt;= 0 and o2 &lt;= 7 and o3 &gt;= 0 and o3 &lt;= 15 and
i0 = 16o0 + o1 and i1 = 16o2 + o3 }"
</pre>
We can test the schedule with the following command line.
<pre>
opt -load /path/to/polly/build/LLVMPolly.so -basicaa -polly-import-jscop
-polly-ast -analyze -q ./test.ll
-polly-import-jscop-postfix=transformed+gpu
</pre>
The output of this schedule is:
<pre>
for (c2=0;c2&lt;=7;c2++) {
for (c3=0;c3&lt;=15;c3++) {
for (c4=0;c4&lt;=7;c4++) {
for (c5=0;c5&lt;=15;c5++) {
Stmt_for_body3(16*c2+c3,16*c4+c5);
}
}
}
}
</pre>
Now we get a 4-dimensional parallel loop nest with a single SCoP statement in it.
<p>2.We then extract the loop body (or the inner-most non-parallel loop) into a
LLVM function, tagging it with PTX_Kernel call convention.</p>
<p>3.We extract the PTX_kernel function into a temporary module, set the target
triple (e.g. nvptx64-unknown-linux) for the module, transform the temporary
module into a string, store it in the original module and erase the
PTX_kernel function.</p>
<p>4.We replace the loops with their GPGPU counterpart. The GPGPU part of code
is composed of a call to the llvm.codegen intrinsic and function calls to our
GPU runtime library.</p>
<p>5.Finally, we generate the executable program with <em>llc</em> or run the
optimized LLVM IRs with a JIT compiler like <em>lli</em>.</p>
<h2>Usage</h2>
<p>1. Apply the llvm.codegen intrinsic patch to LLVM code base.</p>
<pre>cd /path/to/llvm/source
git am /path/to/polly/source/utils/0001-Add-llvm.codegen-intrinsic.patch</pre>
<p>2. Build the test case.</p>
<pre>/path/to/polly/source/test/create_ll.sh test.c</pre>
<p>3. Get and edit the jscop file (take function "gpu_codegen" as an example).
</p>
<pre>opt -load /path/to/polly/build/lib/LLVMPolly.so -basicaa
-polly-export-jscop ./test.ll
cp gpu_codegen___%for.cond---%for.end8.jscop
gpu_codegen___%for.cond---%for.end8.jscop.transformed+gpu
vi gpu_codegen___%for.cond---%for.end8.jscop.transformed+gpu</pre>
<p><em>(Please refer to section "The Work Flow" on how to edit the "schedule"
part of a statement)</em></p>
<p>4. Optimize the code with GPGPU code generation.</p>
<pre>opt -load /path/to/polly/build/lib/LLVMPolly.so -basicaa
-polly-import-jscop-postfix=transformed+gpu -enable-polly-gpgpu
-polly-gpgpu-triple=nvptx64-unknown-unknown -polly-codegen ./test.ll -S
-o test.gpued.ll</pre>
<p>5. Build the final assembly and executable.</p>
<pre>llc test.gpued.ll -o test.s
gcc test.s -lGPURuntime -o test</pre>
<p><em>(Please make sure that LD_LIBRARY_PATH is set properly so that
/path/to/polly/build/lib/libGPURuntime.so is visible to gcc.)</em></p>
<h2>TODO List</h2>
<table class="wikitable" cellpadding="2">
<tbody>
<tr style="background: rgb(239, 239, 239)">
<th width="400px"> Tasks</th>
<th width="150px"> Status </th>
<th> Owner </th>
</tr>
<tr>
<th align="left">Tiling the Parallel Loops with An External Jscop File</th>
<td align="center" class='open'>Open, In Design</td>
<td>Yabin Hu</td>
</tr>
<tr>
<th align="left">GPU Runtime Library Implementation</th>
<td align="center" class='inprogress'>Coding Finished, In Reviewing</td>
<td></td>
</tr>
<tr>
<th align="left">llvm.codegen Intrinsic Implementation</th>
<td align="center" class='inprogress'>Coding Finished, To Be Reviewed</td>
<td></td>
</tr>
<tr>
<th align="left">Code Generation For Host</th>
<td align="center" class='inprogress'>50% Done</td>
<td></td>
</tr>
</tbody></table>
<h2>References</h2>
<ol>
<li type="1" value="1">
<em>Automatic C-to-CUDA Code Generation for Affine Programs. </em><br />
Muthu Manikandan Baskaran, J. Ramanujam and P. Sadayappan.<br />
International Conference on Compiler Construction (CC) 2010.<br />
</li>
<li type="1"><em>PPCG Project</em><br />
<a href="http://freecode.com/projects/ppcg">http://freecode.com/projects/ppcg
</a></li>
<li type="1">
<em>Where is the Data? Why You Cannot Debate GPU vs. CPU Performance Without the
Answer. </em><br />
Chris Gregg and Kim Hazelwood<br />
International Symposium on Performance Analysis of Systems and Software
(ISPASS) 2011.
</li>
</ol>
<p></p>
</div>
</div>
</body>
</html>

View File

@@ -0,0 +1,56 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<!-- Material used from: HTML 4.01 specs: http://www.w3.org/TR/html401/ -->
<html>
<head>
<META http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
<title>Polly - The available LLVM passes</title>
<link type="text/css" rel="stylesheet" href="../menu.css">
<link type="text/css" rel="stylesheet" href="../content.css">
</head>
<body>
<div id="box">
<!--#include virtual="../menu.html.incl"-->
<div id="content">
<!--*********************************************************************-->
<h1>The available LLVM passes</h1>
<!--*********************************************************************-->
<p>Polly consists of a set of LLVM passes. </p>
<h2>Front End</h2>
<ul>
<li><em>polly-canonicalize</em> Prepare code for Polly</li>
<li><em>polly-detect</em> Detect SCoPs in functions</li>
<li><em>polly-scops</em> Create polyhedral description of SCoPs</li>
</ul>
<h2>Middle End</h2>
<ul>
<li><em>polly-dependences</em> Calculate the dependences in a SCoP</li>
<li><em>polly-opt-isl</em> Optimize the SCoP using isl</li>
<li>Import/Export
<ul>
<li><em>polly-export-jscop</em> Export SCoPs as JSON
(Writes a .jscop file for each SCoP)</li>
<li><em>polly-import-jscop</em> Import SCoPs from JSON
(Reads a .jscop file for each SCoP)</li>
</ul>
</li>
<li>Graphviz
<ul>
<li><em>dot-scops</em> Print SCoPs of function</li>
<li><em>dot-scops-only</em> Print SCoPs of function (without function bodies)</li>
<li><em>view-scops</em> View SCoPs of function</li>
<li><em>view-scops-only</em> View SCoPs of function (without function bodies)</li>
</ul></li>
</ul>
<h2>Back End</h2>
<ul>
<li><em>polly-ast</em> Execute isl code generation</li>
<li><em>polly-codegen</em> Create LLVM-IR from the polyhedral information</li>
</ul>
</div>
</div>
</body>
</html>

View File

@@ -0,0 +1,52 @@
#include <stdio.h>
#define N 1536
float A[N][N];
float B[N][N];
float C[N][N];
/*
 * Fill the input matrices A and B with the same deterministic pattern:
 * element (row, col) of both receives (1 + (row * col) % 1024) / 2.0,
 * narrowed to float on assignment.
 */
void init_array()
{
    int row, col;

    for (row = 0; row < N; row++) {
        for (col = 0; col < N; col++) {
            /* One value per position, shared by both operands. */
            const float value = (1+(row*col)%1024)/2.0;

            A[row][col] = value;
            B[row][col] = value;
        }
    }
}
/*
 * Write the result matrix C to stdout. Elements are printed with "%lf "
 * and a newline is emitted after every 80th element of a row as well as
 * at the end of each row.
 */
void print_array()
{
    int row, col;

    for (row = 0; row < N; row++) {
        for (col = 0; col < N; col++) {
            fprintf(stdout, "%lf ", C[row][col]);

            /* Only the last column of each group of 80 ends the line. */
            if (col%80 != 79)
                continue;
            fprintf(stdout, "\n");
        }
        fprintf(stdout, "\n");
    }
}
/*
 * Benchmark driver: initialise A and B, then compute C = A * B with the
 * textbook triple loop (i over rows of C, j over columns, k over the
 * reduction dimension). When TEST is defined at compile time the result
 * matrix is printed so that outputs of different builds can be compared.
 *
 * Returns 0.
 */
int main()
{
    int i, j, k;

    init_array();

    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            C[i][j] = 0;
            for (k = 0; k < N; k++)
                C[i][j] = C[i][j] + A[i][k] * B[k][j];
        }
    }

#ifdef TEST
    print_array();
#endif
    return 0;
}

View File

@@ -0,0 +1,274 @@
.file "matmul.normalopt.ll"
# x86-64, AT&T/GAS syntax (source operand first, %-prefixed registers).
# 8-byte constant pool entry holding the scale factor used by init_array.
.section .rodata.cst8,"aM",@progbits,8
.align 8
.LCPI0_0:
.quad 4602678819172646912 # double 0.5
.text
#-----------------------------------------------------------------------
# init_array
# Reads no argument registers and sets no return value.
# For every i, j in [0, 1536) stores the single-precision value
#   (1 + (i*j) % 1024) * 0.5
# to A[i][j] and B[i][j] (rows are 6144 bytes = 1536 floats apart).
# Register roles: r8 = i, rcx = j, xmm0 = 0.5 (loop invariant).
# Scratch registers written: rax, rcx, rdx, rsi, rdi, r8, xmm0, xmm1, flags.
#-----------------------------------------------------------------------
.globl init_array
.align 16, 0x90
.type init_array,@function
init_array: # @init_array
.cfi_startproc
# BB#0: # %entry
# Standard frame-pointer prologue; the .cfi_* directives describe it for
# the unwinder.
pushq %rbp
.Ltmp2:
.cfi_def_cfa_offset 16
.Ltmp3:
.cfi_offset %rbp, -16
movq %rsp, %rbp
.Ltmp4:
.cfi_def_cfa_register %rbp
# i = 0; load the 0.5 constant once, outside both loops.
xorl %r8d, %r8d
vmovsd .LCPI0_0(%rip), %xmm0
.align 16, 0x90
.LBB0_1: # %for.cond1.preheader
# =>This Loop Header: Depth=1
# Child Loop BB0_2 Depth 2
# j = 0 at the start of every row.
xorl %ecx, %ecx
.align 16, 0x90
.LBB0_2: # %for.body3
# Parent Loop BB0_1 Depth=1
# => This Inner Loop Header: Depth=2
# edx = i * j (32-bit product).
movl %ecx, %edx
imull %r8d, %edx
# Signed remainder by 1024 without a divide: add a bias of 1023 when the
# product is negative, clear the low 10 bits to get the product truncated
# toward zero to a multiple of 1024, then negate so it can be subtracted.
movl %edx, %esi
sarl $31, %esi
shrl $22, %esi
addl %edx, %esi
andl $-1024, %esi # imm = 0xFFFFFFFFFFFFFC00
negl %esi
# rax = i * 2048; multiplied by 3 below to form the row byte offset.
movq %r8, %rax
shlq $11, %rax
# edi = 1 + (i*j) % 1024.
leal 1(%rdx,%rsi), %edi
# rsi = i * 6144 = byte offset of row i.
leaq (%rax,%rax,2), %rsi
# rdx = j + 1. The comparison result is consumed by the jne at the bottom
# of the loop; the AVX and mov instructions in between leave flags intact.
leaq 1(%rcx), %rdx
cmpq $1536, %rdx # imm = 0x600
# Convert to double, scale by 0.5, narrow to float.
vcvtsi2sdl %edi, %xmm0, %xmm1
vmulsd %xmm0, %xmm1, %xmm1
vcvtsd2ss %xmm1, %xmm1, %xmm1
# Store the same value to A[i][j] and B[i][j].
vmovss %xmm1, A(%rsi,%rcx,4)
vmovss %xmm1, B(%rsi,%rcx,4)
# j = j + 1; repeat while j != 1536.
movq %rdx, %rcx
jne .LBB0_2
# BB#3: # %for.inc17
# in Loop: Header=BB0_1 Depth=1
# i = i + 1; repeat while i != 1536.
incq %r8
cmpq $1536, %r8 # imm = 0x600
jne .LBB0_1
# BB#4: # %for.end19
popq %rbp
ret
.Ltmp5:
.size init_array, .Ltmp5-init_array
.cfi_endproc
#-----------------------------------------------------------------------
# print_array
# Reads no argument registers and sets no return value.
# Prints the 1536 x 1536 float matrix C to stdout with fprintf("%lf "),
# writing a newline (fputc(10, stdout)) after every element whose column
# index j satisfies j % 80 == 79, and once more after each row.
# Register roles (all callee-saved, so they survive the calls):
#   r14 = row counter, r15 = address of the current row of C,
#   rbx = column counter j, r12 = address of the current element.
#-----------------------------------------------------------------------
.globl print_array
.align 16, 0x90
.type print_array,@function
print_array: # @print_array
.cfi_startproc
# BB#0: # %entry
pushq %rbp
.Ltmp9:
.cfi_def_cfa_offset 16
.Ltmp10:
.cfi_offset %rbp, -16
movq %rsp, %rbp
.Ltmp11:
.cfi_def_cfa_register %rbp
# Save the callee-saved registers used as loop state. Together with the
# return address and %rbp this is six 8-byte slots, so %rsp is 16-byte
# aligned at the call sites below.
pushq %r15
pushq %r14
pushq %r12
pushq %rbx
.Ltmp12:
.cfi_offset %rbx, -48
.Ltmp13:
.cfi_offset %r12, -40
.Ltmp14:
.cfi_offset %r14, -32
.Ltmp15:
.cfi_offset %r15, -24
# row = 0; r15 = &C[0][0], loaded as a 32-bit absolute address.
xorl %r14d, %r14d
movl $C, %r15d
.align 16, 0x90
.LBB1_1: # %for.cond1.preheader
# =>This Loop Header: Depth=1
# Child Loop BB1_2 Depth 2
# rax carries the current stdout pointer into the inner loop.
movq stdout(%rip), %rax
movq %r15, %r12
xorl %ebx, %ebx
.align 16, 0x90
.LBB1_2: # %for.body3
# Parent Loop BB1_1 Depth=1
# => This Inner Loop Header: Depth=2
# fprintf(stdout, "%lf ", (double)C[row][j]): the float is widened to
# double, and %al = 1 tells the variadic callee that one vector register
# carries an argument.
vmovss (%r12), %xmm0
vcvtss2sd %xmm0, %xmm0, %xmm0
movq %rax, %rdi
movl $.L.str, %esi
movb $1, %al
callq fprintf
# eax = j % 80, computed with a multiply-by-reciprocal signed division:
# quotient = (j * 0x66666667) >> 37, plus the product's sign bit.
movslq %ebx, %rax
imulq $1717986919, %rax, %rcx # imm = 0x66666667
movq %rcx, %rdx
shrq $63, %rdx
sarq $37, %rcx
addl %edx, %ecx
imull $80, %ecx, %ecx
subl %ecx, %eax
# Skip the extra newline unless j % 80 == 79.
cmpl $79, %eax
jne .LBB1_4
# BB#3: # %if.then
# in Loop: Header=BB1_2 Depth=2
# fputc('\n', stdout)
movq stdout(%rip), %rsi
movl $10, %edi
callq fputc
.LBB1_4: # %for.inc
# in Loop: Header=BB1_2 Depth=2
# Advance to the next element; stdout is reloaded because %rax does not
# survive the calls above.
addq $4, %r12
incq %rbx
movq stdout(%rip), %rax
cmpq $1536, %rbx # imm = 0x600
jne .LBB1_2
# BB#5: # %for.end
# in Loop: Header=BB1_1 Depth=1
# End of row: fputc('\n', stdout), then step r15 by one row (6144 bytes).
movl $10, %edi
movq %rax, %rsi
callq fputc
addq $6144, %r15 # imm = 0x1800
incq %r14
cmpq $1536, %r14 # imm = 0x600
jne .LBB1_1
# BB#6: # %for.end12
# Restore callee-saved registers in reverse push order.
popq %rbx
popq %r12
popq %r14
popq %r15
popq %rbp
ret
.Ltmp16:
.size print_array, .Ltmp16-print_array
.cfi_endproc
# 8-byte constant pool entry holding the 0.5 scale factor used by main.
.section .rodata.cst8,"aM",@progbits,8
.align 8
.LCPI2_0:
.quad 4602678819172646912 # double 0.5
.text
#-----------------------------------------------------------------------
# main
# Returns 0 in %eax. Makes no calls.
# Phase 1 (.LBB2_1/.LBB2_2): fills A and B with (1 + (i*j) % 1024) * 0.5;
#   the instruction sequence matches init_array, and the ".i" suffix on the
#   block names suggests it is an inlined copy - NOTE(review): confirm.
# Phase 2 (.LBB2_5/.LBB2_6/.LBB2_7): C[i][j] = sum over k of
#   A[i][k] * B[k][j], as a three-level loop nest.
#-----------------------------------------------------------------------
.globl main
.align 16, 0x90
.type main,@function
main: # @main
.cfi_startproc
# BB#0: # %entry
pushq %rbp
.Ltmp19:
.cfi_def_cfa_offset 16
.Ltmp20:
.cfi_offset %rbp, -16
movq %rsp, %rbp
.Ltmp21:
.cfi_def_cfa_register %rbp
# Phase 1: r8 = i = 0, xmm0 = 0.5.
xorl %r8d, %r8d
vmovsd .LCPI2_0(%rip), %xmm0
.align 16, 0x90
.LBB2_1: # %for.cond1.preheader.i
# =>This Loop Header: Depth=1
# Child Loop BB2_2 Depth 2
# rcx = j = 0.
xorl %ecx, %ecx
.align 16, 0x90
.LBB2_2: # %for.body3.i
# Parent Loop BB2_1 Depth=1
# => This Inner Loop Header: Depth=2
# edx = i * j.
movl %ecx, %edx
imull %r8d, %edx
# esi = -(i*j truncated toward zero to a multiple of 1024), used to form
# the signed remainder (i*j) % 1024 without a divide.
movl %edx, %esi
sarl $31, %esi
shrl $22, %esi
addl %edx, %esi
andl $-1024, %esi # imm = 0xFFFFFFFFFFFFFC00
negl %esi
movq %r8, %rax
shlq $11, %rax
# edi = 1 + (i*j) % 1024; rsi = i * 6144 = byte offset of row i.
leal 1(%rdx,%rsi), %edi
leaq (%rax,%rax,2), %rsi
# rdx = j + 1; flags from this compare are consumed by the jne below.
leaq 1(%rcx), %rdx
cmpq $1536, %rdx # imm = 0x600
# Convert to double, scale by 0.5, narrow to float, store to A and B.
vcvtsi2sdl %edi, %xmm0, %xmm1
vmulsd %xmm0, %xmm1, %xmm1
vcvtsd2ss %xmm1, %xmm1, %xmm1
vmovss %xmm1, A(%rsi,%rcx,4)
vmovss %xmm1, B(%rsi,%rcx,4)
movq %rdx, %rcx
jne .LBB2_2
# BB#3: # %for.inc17.i
# in Loop: Header=BB2_1 Depth=1
incq %r8
cmpq $1536, %r8 # imm = 0x600
jne .LBB2_1
# BB#4:
# Phase 2: r8 = i = 0, r9 = address of row i of A.
xorl %r8d, %r8d
movl $A, %r9d
.align 16, 0x90
.LBB2_5: # %for.cond1.preheader
# =>This Loop Header: Depth=1
# Child Loop BB2_6 Depth 2
# Child Loop BB2_7 Depth 3
# rdx = i * 3 * 2048 = i * 6144 (row byte offset); rsi = &C[i][0]; j = 0.
leaq (%r8,%r8,2), %rdx
shlq $11, %rdx
leaq C(%rdx), %rsi
xorl %edi, %edi
.align 16, 0x90
.LBB2_6: # %for.body3
# Parent Loop BB2_5 Depth=1
# => This Loop Header: Depth=2
# Child Loop BB2_7 Depth 3
# C[i][j] = 0 in memory, and the running sum in xmm0 is cleared.
movl $0, (%rsi)
vxorps %xmm0, %xmm0, %xmm0
# rax counts upward from -9437184 (minus the byte size of B) to 0 in
# steps of one row, so the k loop needs no separate compare; rcx walks
# row i of A.
movq $-9437184, %rax # imm = 0xFFFFFFFFFF700000
movq %r9, %rcx
.align 16, 0x90
.LBB2_7: # %for.body8
# Parent Loop BB2_5 Depth=1
# Parent Loop BB2_6 Depth=2
# => This Inner Loop Header: Depth=3
# xmm0 += A[i][k] * B[k][j]; B+9437184+rax is the start of row k of B.
vmovss (%rcx), %xmm1
vmulss B+9437184(%rax,%rdi,4), %xmm1, %xmm1
vaddss %xmm1, %xmm0, %xmm0
addq $4, %rcx
# The addq sets ZF when rax reaches 0, i.e. after the last row of B.
addq $6144, %rax # imm = 0x1800
jne .LBB2_7
# BB#8: # %for.inc25
# in Loop: Header=BB2_6 Depth=2
# Store the finished sum to C[i][j], then point rsi at C[i][j+1].
vmovss %xmm0, (%rsi)
leaq C+4(%rdx,%rdi,4), %rsi
incq %rdi
cmpq $1536, %rdi # imm = 0x600
jne .LBB2_6
# BB#9: # %for.inc28
# in Loop: Header=BB2_5 Depth=1
# Next row of A and next i.
addq $6144, %r9 # imm = 0x1800
incq %r8
cmpq $1536, %r8 # imm = 0x600
jne .LBB2_5
# BB#10: # %for.end30
# return 0
xorl %eax, %eax
popq %rbp
ret
.Ltmp22:
.size main, .Ltmp22-main
.cfi_endproc
# Matrices A, B and C are common symbols of 9437184 bytes each
# (1536 * 1536 * 4-byte floats) with 16-byte alignment.
.type A,@object # @A
.comm A,9437184,16
.type B,@object # @B
.comm B,9437184,16
# Format string passed to fprintf by print_array; 5 bytes including the
# terminating NUL, placed in a mergeable string section.
.type .L.str,@object # @.str
.section .rodata.str1.1,"aMS",@progbits,1
.L.str:
.asciz "%lf "
.size .L.str, 5
.type C,@object # @C
.comm C,9437184,16
# Empty .note.GNU-stack section: GNU toolchain marker that this object
# does not need an executable stack.
.section ".note.GNU-stack","",@progbits

View File

@@ -0,0 +1,396 @@
.file "matmul.polly.interchanged+tiled+vector.ll"
# x86-64, AT&T/GAS syntax (source operand first, %-prefixed registers).
# 8-byte constant pool entry holding the scale factor used by init_array.
.section .rodata.cst8,"aM",@progbits,8
.align 8
.LCPI0_0:
.quad 4602678819172646912 # double 0.5
.text
#-----------------------------------------------------------------------
# init_array
# Reads no argument registers and sets no return value.
# For every i, j in [0, 1536) stores the single-precision value
#   (1 + (i*j) % 1024) * 0.5
# to A[i][j] and B[i][j] (rows are 6144 bytes = 1536 floats apart).
# Register roles: r8 = i, rcx = j, xmm0 = 0.5 (loop invariant).
# Scratch registers written: rax, rcx, rdx, rsi, rdi, r8, xmm0, xmm1, flags.
#-----------------------------------------------------------------------
.globl init_array
.align 16, 0x90
.type init_array,@function
init_array: # @init_array
.cfi_startproc
# BB#0: # %entry
# Standard frame-pointer prologue; the .cfi_* directives describe it for
# the unwinder.
pushq %rbp
.Ltmp2:
.cfi_def_cfa_offset 16
.Ltmp3:
.cfi_offset %rbp, -16
movq %rsp, %rbp
.Ltmp4:
.cfi_def_cfa_register %rbp
# i = 0; load the 0.5 constant once, outside both loops.
xorl %r8d, %r8d
vmovsd .LCPI0_0(%rip), %xmm0
.align 16, 0x90
.LBB0_1: # %polly.loop_preheader3
# =>This Loop Header: Depth=1
# Child Loop BB0_2 Depth 2
# j = 0 at the start of every row.
xorl %ecx, %ecx
.align 16, 0x90
.LBB0_2: # %polly.loop_header2
# Parent Loop BB0_1 Depth=1
# => This Inner Loop Header: Depth=2
# edx = i * j (32-bit product).
movl %ecx, %edx
imull %r8d, %edx
# Signed remainder by 1024 without a divide: add a bias of 1023 when the
# product is negative, clear the low 10 bits to get the product truncated
# toward zero to a multiple of 1024, then negate so it can be subtracted.
movl %edx, %esi
sarl $31, %esi
shrl $22, %esi
addl %edx, %esi
andl $-1024, %esi # imm = 0xFFFFFFFFFFFFFC00
negl %esi
# rax = i * 2048; multiplied by 3 below to form the row byte offset.
movq %r8, %rax
shlq $11, %rax
# edi = 1 + (i*j) % 1024.
leal 1(%rdx,%rsi), %edi
# rsi = i * 6144 = byte offset of row i.
leaq (%rax,%rax,2), %rsi
# rdx = j + 1. The comparison result is consumed by the jne at the bottom
# of the loop; the AVX and mov instructions in between leave flags intact.
leaq 1(%rcx), %rdx
cmpq $1536, %rdx # imm = 0x600
# Convert to double, scale by 0.5, narrow to float.
vcvtsi2sdl %edi, %xmm0, %xmm1
vmulsd %xmm0, %xmm1, %xmm1
vcvtsd2ss %xmm1, %xmm1, %xmm1
# Store the same value to A[i][j] and B[i][j].
vmovss %xmm1, A(%rsi,%rcx,4)
vmovss %xmm1, B(%rsi,%rcx,4)
# j = j + 1; repeat while j != 1536.
movq %rdx, %rcx
jne .LBB0_2
# BB#3: # %polly.loop_exit4
# in Loop: Header=BB0_1 Depth=1
# i = i + 1; repeat while i != 1536.
incq %r8
cmpq $1536, %r8 # imm = 0x600
jne .LBB0_1
# BB#4: # %polly.loop_exit
popq %rbp
ret
.Ltmp5:
.size init_array, .Ltmp5-init_array
.cfi_endproc
#-----------------------------------------------------------------------
# print_array
# Reads no argument registers and sets no return value.
# Prints the 1536 x 1536 float matrix C to stdout with fprintf("%lf "),
# writing a newline (fputc(10, stdout)) after every element whose column
# index j satisfies j % 80 == 79, and once more after each row.
# Register roles (all callee-saved, so they survive the calls):
#   r14 = row counter, r15 = address of the current row of C,
#   rbx = column counter j, r12 = address of the current element.
#-----------------------------------------------------------------------
.globl print_array
.align 16, 0x90
.type print_array,@function
print_array: # @print_array
.cfi_startproc
# BB#0: # %entry
pushq %rbp
.Ltmp9:
.cfi_def_cfa_offset 16
.Ltmp10:
.cfi_offset %rbp, -16
movq %rsp, %rbp
.Ltmp11:
.cfi_def_cfa_register %rbp
# Save the callee-saved registers used as loop state. Together with the
# return address and %rbp this is six 8-byte slots, so %rsp is 16-byte
# aligned at the call sites below.
pushq %r15
pushq %r14
pushq %r12
pushq %rbx
.Ltmp12:
.cfi_offset %rbx, -48
.Ltmp13:
.cfi_offset %r12, -40
.Ltmp14:
.cfi_offset %r14, -32
.Ltmp15:
.cfi_offset %r15, -24
# row = 0; r15 = &C[0][0], loaded as a 32-bit absolute address.
xorl %r14d, %r14d
movl $C, %r15d
.align 16, 0x90
.LBB1_1: # %for.cond1.preheader
# =>This Loop Header: Depth=1
# Child Loop BB1_2 Depth 2
# rax carries the current stdout pointer into the inner loop.
movq stdout(%rip), %rax
movq %r15, %r12
xorl %ebx, %ebx
.align 16, 0x90
.LBB1_2: # %for.body3
# Parent Loop BB1_1 Depth=1
# => This Inner Loop Header: Depth=2
# fprintf(stdout, "%lf ", (double)C[row][j]): the float is widened to
# double, and %al = 1 tells the variadic callee that one vector register
# carries an argument.
vmovss (%r12), %xmm0
vcvtss2sd %xmm0, %xmm0, %xmm0
movq %rax, %rdi
movl $.L.str, %esi
movb $1, %al
callq fprintf
# eax = j % 80, computed with a multiply-by-reciprocal signed division:
# quotient = (j * 0x66666667) >> 37, plus the product's sign bit.
movslq %ebx, %rax
imulq $1717986919, %rax, %rcx # imm = 0x66666667
movq %rcx, %rdx
shrq $63, %rdx
sarq $37, %rcx
addl %edx, %ecx
imull $80, %ecx, %ecx
subl %ecx, %eax
# Skip the extra newline unless j % 80 == 79.
cmpl $79, %eax
jne .LBB1_4
# BB#3: # %if.then
# in Loop: Header=BB1_2 Depth=2
# fputc('\n', stdout)
movq stdout(%rip), %rsi
movl $10, %edi
callq fputc
.LBB1_4: # %for.inc
# in Loop: Header=BB1_2 Depth=2
# Advance to the next element; stdout is reloaded because %rax does not
# survive the calls above.
addq $4, %r12
incq %rbx
movq stdout(%rip), %rax
cmpq $1536, %rbx # imm = 0x600
jne .LBB1_2
# BB#5: # %for.end
# in Loop: Header=BB1_1 Depth=1
# End of row: fputc('\n', stdout), then step r15 by one row (6144 bytes).
movl $10, %edi
movq %rax, %rsi
callq fputc
addq $6144, %r15 # imm = 0x1800
incq %r14
cmpq $1536, %r14 # imm = 0x600
jne .LBB1_1
# BB#6: # %for.end12
# Restore callee-saved registers in reverse push order.
popq %rbx
popq %r12
popq %r14
popq %r15
popq %rbp
ret
.Ltmp16:
.size print_array, .Ltmp16-print_array
.cfi_endproc
.section .rodata.cst8,"aM",@progbits,8
.align 8
.LCPI2_0:
.quad 4602678819172646912 # double 0.5
.text
# main (compiler-generated, AT&T syntax; reads no arguments, returns 0 in %eax).
#   1. Fills A and B: for i, j in [0, 1536):
#        A[i][j] = B[i][j] = (float)(0.5 * (1 + (i*j) % 1024))   (signed 32-bit remainder)
#      Rows are 6144 bytes apart (1536 floats of 4 bytes).
#   2. memset(C, 0, 9437184).
#   3. Tiled matrix multiply, C[i][j] += A[i][k] * B[k][j], with tile starts stepping
#      by 64 in i, j and k, and the innermost j loop handling 4 floats per iteration
#      (vbroadcastss of A[i][k], then vmulps / vaddps / vmovaps on 16-byte chunks).
# Frame layout (offsets from %rbp) used by the tiled nest:
#   -56  start of the current i tile
#   -72  (i tile start) | 63          -48  ((i tile start) | 63) - 1
#   -80  start of the current j tile
#   -88  C-based pointer for the current i tile (starts at C+16, +393216 per i tile)
#   -64  -88 value + 4 * %r15 (C pointer for the current i tile / j tile)
.globl main
.align 16, 0x90
.type main,@function
main: # @main
.cfi_startproc
# BB#0: # %entry
pushq %rbp
.Ltmp20:
.cfi_def_cfa_offset 16
.Ltmp21:
.cfi_offset %rbp, -16
movq %rsp, %rbp
.Ltmp22:
.cfi_def_cfa_register %rbp
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbx
# 6 pushes (48 bytes) + return address (8) + 56 bytes of locals = 112, so %rsp is
# 16-byte aligned at the memset call below.
subq $56, %rsp
.Ltmp23:
.cfi_offset %rbx, -56
.Ltmp24:
.cfi_offset %r12, -48
.Ltmp25:
.cfi_offset %r13, -40
.Ltmp26:
.cfi_offset %r14, -32
.Ltmp27:
.cfi_offset %r15, -24
# Initialization nest: %rbx = i (row), %rcx = j (column), %xmm0 = 0.5 (loop-invariant).
xorl %ebx, %ebx
vmovsd .LCPI2_0(%rip), %xmm0
.align 16, 0x90
.LBB2_1: # %polly.loop_preheader3.i
# =>This Loop Header: Depth=1
# Child Loop BB2_2 Depth 2
xorl %ecx, %ecx
.align 16, 0x90
.LBB2_2: # %polly.loop_header2.i
# Parent Loop BB2_1 Depth=1
# => This Inner Loop Header: Depth=2
# %edx = i*j; the sar/shr/add/and/neg sequence yields -(i*j rounded toward zero to a
# multiple of 1024), so %edi below becomes 1 + (i*j) % 1024.
movl %ecx, %edx
imull %ebx, %edx
movl %edx, %esi
sarl $31, %esi
shrl $22, %esi
addl %edx, %esi
andl $-1024, %esi # imm = 0xFFFFFFFFFFFFFC00
negl %esi
# %rsi = (i << 11) * 3 = i * 6144 = byte offset of row i.
movq %rbx, %rax
shlq $11, %rax
leal 1(%rdx,%rsi), %edi
leaq (%rax,%rax,2), %rsi
leaq 1(%rcx), %rdx
# Flags from this compare are consumed by the jne at the end of the loop body; the
# AVX instructions and movq in between do not modify flags.
cmpq $1536, %rdx # imm = 0x600
vcvtsi2sdl %edi, %xmm0, %xmm1
vmulsd %xmm0, %xmm1, %xmm1
vcvtsd2ss %xmm1, %xmm1, %xmm1
vmovss %xmm1, A(%rsi,%rcx,4)
vmovss %xmm1, B(%rsi,%rcx,4)
movq %rdx, %rcx
jne .LBB2_2
# BB#3: # %polly.loop_exit4.i
# in Loop: Header=BB2_1 Depth=1
incq %rbx
cmpq $1536, %rbx # imm = 0x600
jne .LBB2_1
# BB#4: # %polly.loop_preheader3.preheader
# memset(C, 0, 9437184)
movl $C, %edi
xorl %esi, %esi
movl $9437184, %edx # imm = 0x900000
callq memset
# %rsi = i tile start; the C pointer is kept biased by +16 bytes because %r15 below
# evaluates to (j tile start - 4) elements.
xorl %esi, %esi
movl $C+16, %eax
movq %rax, -88(%rbp) # 8-byte Spill
.align 16, 0x90
.LBB2_5: # %polly.loop_preheader17
# =>This Loop Header: Depth=1
# Child Loop BB2_15 Depth 2
# Child Loop BB2_8 Depth 3
# Child Loop BB2_11 Depth 4
# Child Loop BB2_17 Depth 5
# Child Loop BB2_18 Depth 6
# i-tile loop: record the tile start and its inclusive bounds.
movq %rsi, -56(%rbp) # 8-byte Spill
movq %rsi, %rax
orq $63, %rax
movq %rax, -72(%rbp) # 8-byte Spill
leaq -1(%rax), %rax
movq %rax, -48(%rbp) # 8-byte Spill
xorl %edx, %edx
.align 16, 0x90
.LBB2_15: # %polly.loop_preheader24
# Parent Loop BB2_5 Depth=1
# => This Loop Header: Depth=2
# Child Loop BB2_8 Depth 3
# Child Loop BB2_11 Depth 4
# Child Loop BB2_17 Depth 5
# Child Loop BB2_18 Depth 6
# j-tile loop (%rdx = j tile start). %rax = j-1, or j-4 when j-1 is negative;
# %r15 = %rax rounded toward zero to a multiple of 4 (the vector-loop start counter).
movq %rdx, -80(%rbp) # 8-byte Spill
leaq -4(%rdx), %rcx
movq %rdx, %rax
decq %rax
cmovsq %rcx, %rax
movq %rax, %r15
sarq $63, %r15
shrq $62, %r15
addq %rax, %r15
andq $-4, %r15
# %r13 = last column of the tile (j | 63); %rdx = %r13 - 4 is the innermost loop bound.
movq %rdx, %r13
orq $63, %r13
leaq -4(%r13), %rdx
# %r10 = k tile start; %rbx = B pointer for (k tile, j tile); %r12 = %r15 + 4.
xorl %r10d, %r10d
movq -88(%rbp), %rax # 8-byte Reload
leaq (%rax,%r15,4), %rax
movq %rax, -64(%rbp) # 8-byte Spill
leaq B+16(,%r15,4), %rbx
leaq 4(%r15), %r12
.align 16, 0x90
.LBB2_8: # %polly.loop_header23
# Parent Loop BB2_5 Depth=1
# Parent Loop BB2_15 Depth=2
# => This Loop Header: Depth=3
# Child Loop BB2_11 Depth 4
# Child Loop BB2_17 Depth 5
# Child Loop BB2_18 Depth 6
# k-tile loop. Guards: skip the tile body if a tile start exceeds (start | 63).
cmpq -72(%rbp), %rsi # 8-byte Folded Reload
jg .LBB2_13
# BB#9: # %polly.loop_header30.preheader
# in Loop: Header=BB2_8 Depth=3
movq %r10, %rax
orq $63, %rax
cmpq %rax, %r10
jg .LBB2_13
# BB#10: # in Loop: Header=BB2_8 Depth=3
# %rax = (k | 63) - 1 (bound for the k loop); %r14 = C row pointer; %r11 = i.
decq %rax
movq -64(%rbp), %r14 # 8-byte Reload
movq -56(%rbp), %r11 # 8-byte Reload
.align 16, 0x90
.LBB2_11: # %polly.loop_header37.preheader
# Parent Loop BB2_5 Depth=1
# Parent Loop BB2_15 Depth=2
# Parent Loop BB2_8 Depth=3
# => This Loop Header: Depth=4
# Child Loop BB2_17 Depth 5
# Child Loop BB2_18 Depth 6
# i loop: restart the B row pointer (%r8) and k (%rsi) for each row of the tile.
# The movq instructions keep the cmpq flags intact for the jg.
cmpq %r13, %r12
movq %rbx, %r8
movq %r10, %rsi
jg .LBB2_12
.align 16, 0x90
.LBB2_17: # %polly.loop_header46.preheader
# Parent Loop BB2_5 Depth=1
# Parent Loop BB2_15 Depth=2
# Parent Loop BB2_8 Depth=3
# Parent Loop BB2_11 Depth=4
# => This Loop Header: Depth=5
# Child Loop BB2_18 Depth 6
# k loop: broadcast A[i][k] (row offset i*6144, column offset k*4) to all lanes of %xmm0.
leaq (%r11,%r11,2), %rcx
shlq $11, %rcx
vbroadcastss A(%rcx,%rsi,4), %xmm0
movq %r14, %rdi
movq %r8, %r9
movq %r15, %rcx
.LBB2_18: # %polly.loop_header46
# Parent Loop BB2_5 Depth=1
# Parent Loop BB2_15 Depth=2
# Parent Loop BB2_8 Depth=3
# Parent Loop BB2_11 Depth=4
# Parent Loop BB2_17 Depth=5
# => This Inner Loop Header: Depth=6
# j loop, 4 floats per step: (%rdi)[0..3] += A[i][k] * (%r9)[0..3].
# vmovaps requires (%rdi) to be 16-byte aligned.
vmulps (%r9), %xmm0, %xmm1
vaddps (%rdi), %xmm1, %xmm1
vmovaps %xmm1, (%rdi)
addq $16, %rdi
addq $16, %r9
addq $4, %rcx
cmpq %rdx, %rcx
jle .LBB2_18
# BB#16: # %polly.loop_exit48
# in Loop: Header=BB2_17 Depth=5
# Next B row; leaq increments k without disturbing the flags tested by jle.
addq $6144, %r8 # imm = 0x1800
cmpq %rax, %rsi
leaq 1(%rsi), %rsi
jle .LBB2_17
.align 16, 0x90
.LBB2_12: # %polly.loop_exit39
# in Loop: Header=BB2_11 Depth=4
# Next C row; leaq increments i without disturbing the flags tested by jle.
addq $6144, %r14 # imm = 0x1800
cmpq -48(%rbp), %r11 # 8-byte Folded Reload
leaq 1(%r11), %r11
jle .LBB2_11
.align 16, 0x90
.LBB2_13: # %polly.loop_exit32
# in Loop: Header=BB2_8 Depth=3
# Advance B by 64 rows (64 * 6144 = 393216 bytes) and k tile by 64; %rsi is restored
# to the i tile start because the k loop reused it.
addq $393216, %rbx # imm = 0x60000
cmpq $1472, %r10 # imm = 0x5C0
leaq 64(%r10), %r10
movq -56(%rbp), %rsi # 8-byte Reload
jl .LBB2_8
# BB#14: # %polly.loop_exit25
# in Loop: Header=BB2_15 Depth=2
movq -80(%rbp), %rdx # 8-byte Reload
cmpq $1472, %rdx # imm = 0x5C0
leaq 64(%rdx), %rdx
jl .LBB2_15
# BB#6: # %polly.loop_exit18
# in Loop: Header=BB2_5 Depth=1
# Advance the C pointer by 64 rows and the i tile by 64.
addq $393216, -88(%rbp) # 8-byte Folded Spill
# imm = 0x60000
cmpq $1472, %rsi # imm = 0x5C0
leaq 64(%rsi), %rsi
jl .LBB2_5
# BB#7: # %polly.loop_exit11
# return 0; release locals and restore callee-saved registers in reverse push order.
xorl %eax, %eax
addq $56, %rsp
popq %rbx
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbp
ret
.Ltmp28:
.size main, .Ltmp28-main
.cfi_endproc
.type A,@object # @A
.comm A,9437184,16
.type B,@object # @B
.comm B,9437184,16
.type .L.str,@object # @.str
.section .rodata.str1.1,"aMS",@progbits,1
.L.str:
.asciz "%lf "
.size .L.str, 5
.type C,@object # @C
.comm C,9437184,16
.section ".note.GNU-stack","",@progbits

View File

@@ -0,0 +1,390 @@
.file "matmul.polly.interchanged+tiled.ll"
.section .rodata.cst8,"aM",@progbits,8
.align 8
.LCPI0_0:
.quad 4602678819172646912 # double 0.5
.text
# init_array (compiler-generated, AT&T syntax; takes no register arguments, leaf function).
#   For i, j in [0, 1536):
#     A[i][j] = B[i][j] = (float)(0.5 * (1 + (i*j) % 1024))   (signed 32-bit remainder)
#   Registers: %r8 = i, %rcx = j, %xmm0 = 0.5 (loop-invariant), %xmm1 = value stored.
#   Writes %rax, %rcx, %rdx, %rsi, %rdi, %r8, %xmm0, %xmm1 and the flags; only %rbp is
#   saved and restored.
.globl init_array
.align 16, 0x90
.type init_array,@function
init_array: # @init_array
.cfi_startproc
# BB#0: # %entry
pushq %rbp
.Ltmp2:
.cfi_def_cfa_offset 16
.Ltmp3:
.cfi_offset %rbp, -16
movq %rsp, %rbp
.Ltmp4:
.cfi_def_cfa_register %rbp
xorl %r8d, %r8d
vmovsd .LCPI0_0(%rip), %xmm0
.align 16, 0x90
.LBB0_1: # %polly.loop_preheader3
# =>This Loop Header: Depth=1
# Child Loop BB0_2 Depth 2
xorl %ecx, %ecx
.align 16, 0x90
.LBB0_2: # %polly.loop_header2
# Parent Loop BB0_1 Depth=1
# => This Inner Loop Header: Depth=2
# %edx = i*j; the sar/shr/add/and/neg sequence yields -(i*j rounded toward zero to a
# multiple of 1024), so %edi below becomes 1 + (i*j) % 1024.
movl %ecx, %edx
imull %r8d, %edx
movl %edx, %esi
sarl $31, %esi
shrl $22, %esi
addl %edx, %esi
andl $-1024, %esi # imm = 0xFFFFFFFFFFFFFC00
negl %esi
# %rsi = (i << 11) * 3 = i * 6144 = byte offset of row i (1536 floats per row).
movq %r8, %rax
shlq $11, %rax
leal 1(%rdx,%rsi), %edi
leaq (%rax,%rax,2), %rsi
leaq 1(%rcx), %rdx
# Flags from this compare are consumed by the jne below; the AVX instructions and
# movq in between do not modify flags.
cmpq $1536, %rdx # imm = 0x600
vcvtsi2sdl %edi, %xmm0, %xmm1
vmulsd %xmm0, %xmm1, %xmm1
vcvtsd2ss %xmm1, %xmm1, %xmm1
vmovss %xmm1, A(%rsi,%rcx,4)
vmovss %xmm1, B(%rsi,%rcx,4)
movq %rdx, %rcx
jne .LBB0_2
# BB#3: # %polly.loop_exit4
# in Loop: Header=BB0_1 Depth=1
incq %r8
cmpq $1536, %r8 # imm = 0x600
jne .LBB0_1
# BB#4: # %polly.loop_exit
popq %rbp
ret
.Ltmp5:
.size init_array, .Ltmp5-init_array
.cfi_endproc
# print_array (compiler-generated, AT&T syntax; takes no register arguments).
#   Writes all 1536 x 1536 floats of C to stdout:
#     fprintf(stdout, .L.str, (double)C[i][j]) for every element,
#     fputc(10, stdout) after each element whose column j satisfies j % 80 == 79,
#     fputc(10, stdout) once more at the end of every row.
#   Registers: %r14 = row counter, %r15 = pointer to the current row of C,
#              %rbx = column counter, %r12 = pointer to the current element.
#   Callee-saved %rbx, %r12, %r14, %r15 and %rbp are pushed on entry and popped on exit.
.globl print_array
.align 16, 0x90
.type print_array,@function
print_array: # @print_array
.cfi_startproc
# BB#0: # %entry
pushq %rbp
.Ltmp9:
.cfi_def_cfa_offset 16
.Ltmp10:
.cfi_offset %rbp, -16
movq %rsp, %rbp
.Ltmp11:
.cfi_def_cfa_register %rbp
# 5 pushes (40 bytes) + return address (8) = 48, so %rsp is 16-byte aligned at each call.
pushq %r15
pushq %r14
pushq %r12
pushq %rbx
.Ltmp12:
.cfi_offset %rbx, -48
.Ltmp13:
.cfi_offset %r12, -40
.Ltmp14:
.cfi_offset %r14, -32
.Ltmp15:
.cfi_offset %r15, -24
xorl %r14d, %r14d
movl $C, %r15d
.align 16, 0x90
.LBB1_1: # %for.cond1.preheader
# =>This Loop Header: Depth=1
# Child Loop BB1_2 Depth 2
# %rax carries the current stdout value into the inner loop (used as fprintf's stream).
movq stdout(%rip), %rax
movq %r15, %r12
xorl %ebx, %ebx
.align 16, 0x90
.LBB1_2: # %for.body3
# Parent Loop BB1_1 Depth=1
# => This Inner Loop Header: Depth=2
# Widen the float to double for the variadic call; %al = 1 is the number of vector
# registers used by the call.
vmovss (%r12), %xmm0
vcvtss2sd %xmm0, %xmm0, %xmm0
movq %rax, %rdi
movl $.L.str, %esi
movb $1, %al
callq fprintf
# j % 80 without a divide: quotient = (j * 0x66666667) >> 37, plus the product's sign
# bit; remainder = j - 80 * quotient.
movslq %ebx, %rax
imulq $1717986919, %rax, %rcx # imm = 0x66666667
movq %rcx, %rdx
shrq $63, %rdx
sarq $37, %rcx
addl %edx, %ecx
imull $80, %ecx, %ecx
subl %ecx, %eax
cmpl $79, %eax
jne .LBB1_4
# BB#3: # %if.then
# in Loop: Header=BB1_2 Depth=2
# fputc('\n', stdout)
movq stdout(%rip), %rsi
movl $10, %edi
callq fputc
.LBB1_4: # %for.inc
# in Loop: Header=BB1_2 Depth=2
addq $4, %r12
incq %rbx
# Reload stdout into %rax: it feeds the next fprintf, or the end-of-row fputc below.
movq stdout(%rip), %rax
cmpq $1536, %rbx # imm = 0x600
jne .LBB1_2
# BB#5: # %for.end
# in Loop: Header=BB1_1 Depth=1
# End of row: fputc('\n', stdout), then advance to the next row (6144 bytes).
movl $10, %edi
movq %rax, %rsi
callq fputc
addq $6144, %r15 # imm = 0x1800
incq %r14
cmpq $1536, %r14 # imm = 0x600
jne .LBB1_1
# BB#6: # %for.end12
popq %rbx
popq %r12
popq %r14
popq %r15
popq %rbp
ret
.Ltmp16:
.size print_array, .Ltmp16-print_array
.cfi_endproc
.section .rodata.cst8,"aM",@progbits,8
.align 8
.LCPI2_0:
.quad 4602678819172646912 # double 0.5
.text
# main (compiler-generated, AT&T syntax; reads no arguments, returns 0 in %eax).
#   1. Fills A and B: for i, j in [0, 1536):
#        A[i][j] = B[i][j] = (float)(0.5 * (1 + (i*j) % 1024))   (signed 32-bit remainder)
#   2. memset(C, 0, 9437184).
#   3. Tiled scalar matrix multiply, C[i][j] += A[i][k] * B[k][j]: tile starts step by 64
#      in i, j and k (nest order: i tile, j tile, k tile, i, k, j), one float per
#      innermost iteration (vmulss / vaddss / vmovss).
# Frame layout (offsets from %rbp) used by the tiled nest:
#   -56  start of the current i tile
#   -72  (i tile start) | 63          -48  ((i tile start) | 63) - 1
#   -88  C pointer for the current i tile (saved copy of %rbx, which is reused below)
#   -64  C pointer for the current (i tile, j tile); +256 bytes per j tile
#   -80  B pointer for the current j tile; starts at B, +256 bytes per j tile
.globl main
.align 16, 0x90
.type main,@function
main: # @main
.cfi_startproc
# BB#0: # %entry
pushq %rbp
.Ltmp20:
.cfi_def_cfa_offset 16
.Ltmp21:
.cfi_offset %rbp, -16
movq %rsp, %rbp
.Ltmp22:
.cfi_def_cfa_register %rbp
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbx
# 6 pushes (48 bytes) + return address (8) + 56 bytes of locals = 112, so %rsp is
# 16-byte aligned at the memset call below.
subq $56, %rsp
.Ltmp23:
.cfi_offset %rbx, -56
.Ltmp24:
.cfi_offset %r12, -48
.Ltmp25:
.cfi_offset %r13, -40
.Ltmp26:
.cfi_offset %r14, -32
.Ltmp27:
.cfi_offset %r15, -24
# Initialization nest: %rbx = i (row), %rcx = j (column), %xmm0 = 0.5 (loop-invariant).
xorl %ebx, %ebx
vmovsd .LCPI2_0(%rip), %xmm0
.align 16, 0x90
.LBB2_1: # %polly.loop_preheader3.i
# =>This Loop Header: Depth=1
# Child Loop BB2_2 Depth 2
xorl %ecx, %ecx
.align 16, 0x90
.LBB2_2: # %polly.loop_header2.i
# Parent Loop BB2_1 Depth=1
# => This Inner Loop Header: Depth=2
# %edx = i*j; the sar/shr/add/and/neg sequence yields -(i*j rounded toward zero to a
# multiple of 1024), so %edi below becomes 1 + (i*j) % 1024.
movl %ecx, %edx
imull %ebx, %edx
movl %edx, %esi
sarl $31, %esi
shrl $22, %esi
addl %edx, %esi
andl $-1024, %esi # imm = 0xFFFFFFFFFFFFFC00
negl %esi
# %rsi = (i << 11) * 3 = i * 6144 = byte offset of row i (1536 floats per row).
movq %rbx, %rax
shlq $11, %rax
leal 1(%rdx,%rsi), %edi
leaq (%rax,%rax,2), %rsi
leaq 1(%rcx), %rdx
# Flags from this compare are consumed by the jne below; the AVX instructions and
# movq in between do not modify flags.
cmpq $1536, %rdx # imm = 0x600
vcvtsi2sdl %edi, %xmm0, %xmm1
vmulsd %xmm0, %xmm1, %xmm1
vcvtsd2ss %xmm1, %xmm1, %xmm1
vmovss %xmm1, A(%rsi,%rcx,4)
vmovss %xmm1, B(%rsi,%rcx,4)
movq %rdx, %rcx
jne .LBB2_2
# BB#3: # %polly.loop_exit4.i
# in Loop: Header=BB2_1 Depth=1
incq %rbx
cmpq $1536, %rbx # imm = 0x600
jne .LBB2_1
# BB#4: # %polly.loop_preheader3.preheader
# %rbx = C (callee-saved, so it survives the call); then memset(C, 0, 9437184).
movl $C, %ebx
movl $C, %edi
xorl %esi, %esi
movl $9437184, %edx # imm = 0x900000
callq memset
# %rax = i tile start.
xorl %eax, %eax
.align 16, 0x90
.LBB2_5: # %polly.loop_preheader17
# =>This Loop Header: Depth=1
# Child Loop BB2_15 Depth 2
# Child Loop BB2_8 Depth 3
# Child Loop BB2_11 Depth 4
# Child Loop BB2_17 Depth 5
# Child Loop BB2_18 Depth 6
# i-tile loop: record the tile start, its C pointer and its inclusive bounds.
movq %rax, -56(%rbp) # 8-byte Spill
movq %rbx, -88(%rbp) # 8-byte Spill
movq %rax, %rcx
orq $63, %rcx
movq %rcx, -72(%rbp) # 8-byte Spill
leaq -1(%rcx), %rcx
movq %rcx, -48(%rbp) # 8-byte Spill
# %r15 = (j tile start) - 1: the innermost counter is incremented before it is
# compared. %rcx = B pointer for the j tile, %r12 = j tile start.
movq $-1, %r15
movl $B, %ecx
movq %rbx, -64(%rbp) # 8-byte Spill
xorl %r12d, %r12d
.align 16, 0x90
.LBB2_15: # %polly.loop_preheader24
# Parent Loop BB2_5 Depth=1
# => This Loop Header: Depth=2
# Child Loop BB2_8 Depth 3
# Child Loop BB2_11 Depth 4
# Child Loop BB2_17 Depth 5
# Child Loop BB2_18 Depth 6
# j-tile loop: %r13 = last column of the tile (j | 63); %rbx = %r13 - 1 is the
# innermost loop bound (this overwrites the C pointer, hence the spill at -88).
# %r9 = k tile start, %rdx = B pointer for (k tile, j tile).
movq %rcx, -80(%rbp) # 8-byte Spill
movq %r12, %r13
orq $63, %r13
leaq -1(%r13), %rbx
xorl %r9d, %r9d
movq %rcx, %rdx
.align 16, 0x90
.LBB2_8: # %polly.loop_header23
# Parent Loop BB2_5 Depth=1
# Parent Loop BB2_15 Depth=2
# => This Loop Header: Depth=3
# Child Loop BB2_11 Depth 4
# Child Loop BB2_17 Depth 5
# Child Loop BB2_18 Depth 6
# k-tile loop. Guards: skip the tile body if a tile start exceeds (start | 63).
cmpq -72(%rbp), %rax # 8-byte Folded Reload
jg .LBB2_13
# BB#9: # %polly.loop_header30.preheader
# in Loop: Header=BB2_8 Depth=3
movq %r9, %rax
orq $63, %rax
cmpq %rax, %r9
jg .LBB2_13
# BB#10: # in Loop: Header=BB2_8 Depth=3
# %rax = (k | 63) - 1 (bound for the k loop); %r10 = C row pointer; %r11 = i.
decq %rax
movq -64(%rbp), %r10 # 8-byte Reload
movq -56(%rbp), %r11 # 8-byte Reload
.align 16, 0x90
.LBB2_11: # %polly.loop_header37.preheader
# Parent Loop BB2_5 Depth=1
# Parent Loop BB2_15 Depth=2
# Parent Loop BB2_8 Depth=3
# => This Loop Header: Depth=4
# Child Loop BB2_17 Depth 5
# Child Loop BB2_18 Depth 6
# i loop: restart the B row pointer (%r14) and k (%rcx) for each row of the tile.
# The movq instructions keep the cmpq flags intact for the jg.
cmpq %r13, %r12
movq %rdx, %r14
movq %r9, %rcx
jg .LBB2_12
.align 16, 0x90
.LBB2_17: # %polly.loop_header46.preheader
# Parent Loop BB2_5 Depth=1
# Parent Loop BB2_15 Depth=2
# Parent Loop BB2_8 Depth=3
# Parent Loop BB2_11 Depth=4
# => This Loop Header: Depth=5
# Child Loop BB2_18 Depth 6
# k loop: %xmm0 = A[i][k] (row offset i*6144, column offset k*4).
leaq (%r11,%r11,2), %rsi
shlq $11, %rsi
vmovss A(%rsi,%rcx,4), %xmm0
movq %r10, %rdi
movq %r14, %r8
movq %r15, %rsi
.LBB2_18: # %polly.loop_header46
# Parent Loop BB2_5 Depth=1
# Parent Loop BB2_15 Depth=2
# Parent Loop BB2_8 Depth=3
# Parent Loop BB2_11 Depth=4
# Parent Loop BB2_17 Depth=5
# => This Inner Loop Header: Depth=6
# j loop, one float per step: *(%rdi) += A[i][k] * *(%r8).
vmulss (%r8), %xmm0, %xmm1
vaddss (%rdi), %xmm1, %xmm1
vmovss %xmm1, (%rdi)
addq $4, %rdi
addq $4, %r8
incq %rsi
cmpq %rbx, %rsi
jle .LBB2_18
# BB#16: # %polly.loop_exit48
# in Loop: Header=BB2_17 Depth=5
# Next B row; leaq increments k without disturbing the flags tested by jle.
addq $6144, %r14 # imm = 0x1800
cmpq %rax, %rcx
leaq 1(%rcx), %rcx
jle .LBB2_17
.align 16, 0x90
.LBB2_12: # %polly.loop_exit39
# in Loop: Header=BB2_11 Depth=4
# Next C row; leaq increments i without disturbing the flags tested by jle.
addq $6144, %r10 # imm = 0x1800
cmpq -48(%rbp), %r11 # 8-byte Folded Reload
leaq 1(%r11), %r11
jle .LBB2_11
.align 16, 0x90
.LBB2_13: # %polly.loop_exit32
# in Loop: Header=BB2_8 Depth=3
# Advance B by 64 rows (64 * 6144 = 393216 bytes) and k tile by 64; %rax is restored
# to the i tile start because the k loop reused it as a bound.
addq $393216, %rdx # imm = 0x60000
cmpq $1472, %r9 # imm = 0x5C0
leaq 64(%r9), %r9
movq -56(%rbp), %rax # 8-byte Reload
jl .LBB2_8
# BB#14: # %polly.loop_exit25
# in Loop: Header=BB2_15 Depth=2
# Next j tile: move the C and B pointers 64 floats (256 bytes) to the right and bump
# the biased column counter and the tile start by 64.
addq $256, -64(%rbp) # 8-byte Folded Spill
# imm = 0x100
movq -80(%rbp), %rcx # 8-byte Reload
addq $256, %rcx # imm = 0x100
addq $64, %r15
cmpq $1472, %r12 # imm = 0x5C0
leaq 64(%r12), %r12
jl .LBB2_15
# BB#6: # %polly.loop_exit18
# in Loop: Header=BB2_5 Depth=1
# Next i tile: restore the C pointer and advance it by 64 rows.
movq -88(%rbp), %rbx # 8-byte Reload
addq $393216, %rbx # imm = 0x60000
cmpq $1472, %rax # imm = 0x5C0
leaq 64(%rax), %rax
jl .LBB2_5
# BB#7: # %polly.loop_exit11
# return 0; release locals and restore callee-saved registers in reverse push order.
xorl %eax, %eax
addq $56, %rsp
popq %rbx
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbp
ret
.Ltmp28:
.size main, .Ltmp28-main
.cfi_endproc
.type A,@object # @A
.comm A,9437184,16
.type B,@object # @B
.comm B,9437184,16
.type .L.str,@object # @.str
.section .rodata.str1.1,"aMS",@progbits,1
.L.str:
.asciz "%lf "
.size .L.str, 5
.type C,@object # @C
.comm C,9437184,16
.section ".note.GNU-stack","",@progbits

View File

@@ -0,0 +1,286 @@
.file "matmul.polly.interchanged.ll"
.section .rodata.cst8,"aM",@progbits,8
.align 8
.LCPI0_0:
.quad 4602678819172646912 # double 0.5
.text
# init_array (compiler-generated, AT&T syntax; takes no register arguments, leaf function).
#   For i, j in [0, 1536):
#     A[i][j] = B[i][j] = (float)(0.5 * (1 + (i*j) % 1024))   (signed 32-bit remainder)
#   Registers: %r8 = i, %rcx = j, %xmm0 = 0.5 (loop-invariant), %xmm1 = value stored.
#   Writes %rax, %rcx, %rdx, %rsi, %rdi, %r8, %xmm0, %xmm1 and the flags; only %rbp is
#   saved and restored.
.globl init_array
.align 16, 0x90
.type init_array,@function
init_array: # @init_array
.cfi_startproc
# BB#0: # %entry
pushq %rbp
.Ltmp2:
.cfi_def_cfa_offset 16
.Ltmp3:
.cfi_offset %rbp, -16
movq %rsp, %rbp
.Ltmp4:
.cfi_def_cfa_register %rbp
xorl %r8d, %r8d
vmovsd .LCPI0_0(%rip), %xmm0
.align 16, 0x90
.LBB0_1: # %polly.loop_preheader3
# =>This Loop Header: Depth=1
# Child Loop BB0_2 Depth 2
xorl %ecx, %ecx
.align 16, 0x90
.LBB0_2: # %polly.loop_header2
# Parent Loop BB0_1 Depth=1
# => This Inner Loop Header: Depth=2
# %edx = i*j; the sar/shr/add/and/neg sequence yields -(i*j rounded toward zero to a
# multiple of 1024), so %edi below becomes 1 + (i*j) % 1024.
movl %ecx, %edx
imull %r8d, %edx
movl %edx, %esi
sarl $31, %esi
shrl $22, %esi
addl %edx, %esi
andl $-1024, %esi # imm = 0xFFFFFFFFFFFFFC00
negl %esi
# %rsi = (i << 11) * 3 = i * 6144 = byte offset of row i (1536 floats per row).
movq %r8, %rax
shlq $11, %rax
leal 1(%rdx,%rsi), %edi
leaq (%rax,%rax,2), %rsi
leaq 1(%rcx), %rdx
# Flags from this compare are consumed by the jne below; the AVX instructions and
# movq in between do not modify flags.
cmpq $1536, %rdx # imm = 0x600
vcvtsi2sdl %edi, %xmm0, %xmm1
vmulsd %xmm0, %xmm1, %xmm1
vcvtsd2ss %xmm1, %xmm1, %xmm1
vmovss %xmm1, A(%rsi,%rcx,4)
vmovss %xmm1, B(%rsi,%rcx,4)
movq %rdx, %rcx
jne .LBB0_2
# BB#3: # %polly.loop_exit4
# in Loop: Header=BB0_1 Depth=1
incq %r8
cmpq $1536, %r8 # imm = 0x600
jne .LBB0_1
# BB#4: # %polly.loop_exit
popq %rbp
ret
.Ltmp5:
.size init_array, .Ltmp5-init_array
.cfi_endproc
# print_array (compiler-generated, AT&T syntax; takes no register arguments).
#   Writes all 1536 x 1536 floats of C to stdout:
#     fprintf(stdout, .L.str, (double)C[i][j]) for every element,
#     fputc(10, stdout) after each element whose column j satisfies j % 80 == 79,
#     fputc(10, stdout) once more at the end of every row.
#   Registers: %r14 = row counter, %r15 = pointer to the current row of C,
#              %rbx = column counter, %r12 = pointer to the current element.
#   Callee-saved %rbx, %r12, %r14, %r15 and %rbp are pushed on entry and popped on exit.
.globl print_array
.align 16, 0x90
.type print_array,@function
print_array: # @print_array
.cfi_startproc
# BB#0: # %entry
pushq %rbp
.Ltmp9:
.cfi_def_cfa_offset 16
.Ltmp10:
.cfi_offset %rbp, -16
movq %rsp, %rbp
.Ltmp11:
.cfi_def_cfa_register %rbp
# 5 pushes (40 bytes) + return address (8) = 48, so %rsp is 16-byte aligned at each call.
pushq %r15
pushq %r14
pushq %r12
pushq %rbx
.Ltmp12:
.cfi_offset %rbx, -48
.Ltmp13:
.cfi_offset %r12, -40
.Ltmp14:
.cfi_offset %r14, -32
.Ltmp15:
.cfi_offset %r15, -24
xorl %r14d, %r14d
movl $C, %r15d
.align 16, 0x90
.LBB1_1: # %for.cond1.preheader
# =>This Loop Header: Depth=1
# Child Loop BB1_2 Depth 2
# %rax carries the current stdout value into the inner loop (used as fprintf's stream).
movq stdout(%rip), %rax
movq %r15, %r12
xorl %ebx, %ebx
.align 16, 0x90
.LBB1_2: # %for.body3
# Parent Loop BB1_1 Depth=1
# => This Inner Loop Header: Depth=2
# Widen the float to double for the variadic call; %al = 1 is the number of vector
# registers used by the call.
vmovss (%r12), %xmm0
vcvtss2sd %xmm0, %xmm0, %xmm0
movq %rax, %rdi
movl $.L.str, %esi
movb $1, %al
callq fprintf
# j % 80 without a divide: quotient = (j * 0x66666667) >> 37, plus the product's sign
# bit; remainder = j - 80 * quotient.
movslq %ebx, %rax
imulq $1717986919, %rax, %rcx # imm = 0x66666667
movq %rcx, %rdx
shrq $63, %rdx
sarq $37, %rcx
addl %edx, %ecx
imull $80, %ecx, %ecx
subl %ecx, %eax
cmpl $79, %eax
jne .LBB1_4
# BB#3: # %if.then
# in Loop: Header=BB1_2 Depth=2
# fputc('\n', stdout)
movq stdout(%rip), %rsi
movl $10, %edi
callq fputc
.LBB1_4: # %for.inc
# in Loop: Header=BB1_2 Depth=2
addq $4, %r12
incq %rbx
# Reload stdout into %rax: it feeds the next fprintf, or the end-of-row fputc below.
movq stdout(%rip), %rax
cmpq $1536, %rbx # imm = 0x600
jne .LBB1_2
# BB#5: # %for.end
# in Loop: Header=BB1_1 Depth=1
# End of row: fputc('\n', stdout), then advance to the next row (6144 bytes).
movl $10, %edi
movq %rax, %rsi
callq fputc
addq $6144, %r15 # imm = 0x1800
incq %r14
cmpq $1536, %r14 # imm = 0x600
jne .LBB1_1
# BB#6: # %for.end12
popq %rbx
popq %r12
popq %r14
popq %r15
popq %rbp
ret
.Ltmp16:
.size print_array, .Ltmp16-print_array
.cfi_endproc
.section .rodata.cst8,"aM",@progbits,8
.align 8
.LCPI2_0:
.quad 4602678819172646912 # double 0.5
.text
# main (compiler-generated, AT&T syntax; reads no arguments, returns 0 in %eax).
#   1. Fills A and B: for i, j in [0, 1536):
#        A[i][j] = B[i][j] = (float)(0.5 * (1 + (i*j) % 1024))   (signed 32-bit remainder)
#   2. memset(C, 0, 9437184).
#   3. Untiled scalar matrix multiply in i, k, j loop order:
#        for i, k, j in [0, 1536): C[i][j] += A[i][k] * B[k][j]
#   Registers in step 3: %rax = i, %r14 = pointer to row i of C, %rdx = k,
#   %rbx = pointer to row k of B, %rsi = remaining j iterations (counts 1536 down to 0),
#   %rdi / %rcx = running pointers into the C row / B row.
.globl main
.align 16, 0x90
.type main,@function
main: # @main
.cfi_startproc
# BB#0: # %entry
pushq %rbp
.Ltmp20:
.cfi_def_cfa_offset 16
.Ltmp21:
.cfi_offset %rbp, -16
movq %rsp, %rbp
.Ltmp22:
.cfi_def_cfa_register %rbp
# 3 pushes (24 bytes) + return address (8) = 32, so %rsp is 16-byte aligned at the
# memset call below.
pushq %r14
pushq %rbx
.Ltmp23:
.cfi_offset %rbx, -32
.Ltmp24:
.cfi_offset %r14, -24
# Initialization nest: %rbx = i (row), %rcx = j (column), %xmm0 = 0.5 (loop-invariant).
xorl %ebx, %ebx
vmovsd .LCPI2_0(%rip), %xmm0
.align 16, 0x90
.LBB2_1: # %polly.loop_preheader3.i
# =>This Loop Header: Depth=1
# Child Loop BB2_2 Depth 2
xorl %ecx, %ecx
.align 16, 0x90
.LBB2_2: # %polly.loop_header2.i
# Parent Loop BB2_1 Depth=1
# => This Inner Loop Header: Depth=2
# %edx = i*j; the sar/shr/add/and/neg sequence yields -(i*j rounded toward zero to a
# multiple of 1024), so %edi below becomes 1 + (i*j) % 1024.
movl %ecx, %edx
imull %ebx, %edx
movl %edx, %esi
sarl $31, %esi
shrl $22, %esi
addl %edx, %esi
andl $-1024, %esi # imm = 0xFFFFFFFFFFFFFC00
negl %esi
# %rsi = (i << 11) * 3 = i * 6144 = byte offset of row i (1536 floats per row).
movq %rbx, %rax
shlq $11, %rax
leal 1(%rdx,%rsi), %edi
leaq (%rax,%rax,2), %rsi
leaq 1(%rcx), %rdx
# Flags from this compare are consumed by the jne below; the AVX instructions and
# movq in between do not modify flags.
cmpq $1536, %rdx # imm = 0x600
vcvtsi2sdl %edi, %xmm0, %xmm1
vmulsd %xmm0, %xmm1, %xmm1
vcvtsd2ss %xmm1, %xmm1, %xmm1
vmovss %xmm1, A(%rsi,%rcx,4)
vmovss %xmm1, B(%rsi,%rcx,4)
movq %rdx, %rcx
jne .LBB2_2
# BB#3: # %polly.loop_exit4.i
# in Loop: Header=BB2_1 Depth=1
incq %rbx
cmpq $1536, %rbx # imm = 0x600
jne .LBB2_1
# BB#4: # %polly.loop_preheader3.preheader
# %r14 = C (callee-saved, so it survives the call); then memset(C, 0, 9437184).
movl $C, %r14d
movl $C, %edi
xorl %esi, %esi
movl $9437184, %edx # imm = 0x900000
callq memset
xorl %eax, %eax
.align 16, 0x90
.LBB2_5: # %polly.loop_preheader17
# =>This Loop Header: Depth=1
# Child Loop BB2_10 Depth 2
# Child Loop BB2_8 Depth 3
# i loop: restart at the first row of B with k = 0.
movl $B, %ebx
xorl %edx, %edx
.align 16, 0x90
.LBB2_10: # %polly.loop_preheader24
# Parent Loop BB2_5 Depth=1
# => This Loop Header: Depth=2
# Child Loop BB2_8 Depth 3
# k loop: %xmm0 = A[i][k] (row offset i*6144, column offset k*4).
leaq (%rax,%rax,2), %rcx
shlq $11, %rcx
vmovss A(%rcx,%rdx,4), %xmm0
movl $1536, %esi # imm = 0x600
movq %r14, %rdi
movq %rbx, %rcx
.align 16, 0x90
.LBB2_8: # %polly.loop_header23
# Parent Loop BB2_5 Depth=1
# Parent Loop BB2_10 Depth=2
# => This Inner Loop Header: Depth=3
# j loop, one float per step: *(%rdi) += A[i][k] * *(%rcx).
vmulss (%rcx), %xmm0, %xmm1
vaddss (%rdi), %xmm1, %xmm1
vmovss %xmm1, (%rdi)
addq $4, %rdi
addq $4, %rcx
decq %rsi
jne .LBB2_8
# BB#9: # %polly.loop_exit25
# in Loop: Header=BB2_10 Depth=2
# Next row of B.
addq $6144, %rbx # imm = 0x1800
incq %rdx
cmpq $1536, %rdx # imm = 0x600
jne .LBB2_10
# BB#6: # %polly.loop_exit18
# in Loop: Header=BB2_5 Depth=1
# Next row of C.
addq $6144, %r14 # imm = 0x1800
incq %rax
cmpq $1536, %rax # imm = 0x600
jne .LBB2_5
# BB#7: # %polly.loop_exit11
# return 0; restore callee-saved registers in reverse push order.
xorl %eax, %eax
popq %rbx
popq %r14
popq %rbp
ret
.Ltmp25:
.size main, .Ltmp25-main
.cfi_endproc
.type A,@object # @A
.comm A,9437184,16
.type B,@object # @B
.comm B,9437184,16
.type .L.str,@object # @.str
.section .rodata.str1.1,"aMS",@progbits,1
.L.str:
.asciz "%lf "
.size .L.str, 5
.type C,@object # @C
.comm C,9437184,16
.section ".note.GNU-stack","",@progbits

Some files were not shown because too many files have changed in this diff Show More