diff --git a/tools/gzinject/.gitattributes b/tools/gzinject/.gitattributes
new file mode 100644
index 000000000..66cccc58a
--- /dev/null
+++ b/tools/gzinject/.gitattributes
@@ -0,0 +1,65 @@
+###############################################################################
+# Set default behavior to automatically normalize line endings.
+###############################################################################
+* text=auto
+
+###############################################################################
+# Set default behavior for command prompt diff.
+#
+# This is needed for earlier builds of msysgit that do not have it on by
+# default for csharp files.
+# Note: This is only used by command line
+###############################################################################
+#*.cs diff=csharp
+
+###############################################################################
+# Set the merge driver for project and solution files
+#
+# Merging from the command prompt will add diff markers to the files if there
+# are conflicts (Merging from VS is not affected by the settings below, in VS
+# the diff markers are never inserted). Diff markers may cause the following
+# file extensions to fail to load in VS. An alternative would be to treat
+# these files as binary and thus will always conflict and require user
+# intervention with every merge. To do so, just uncomment the entries below
+###############################################################################
+#*.sln merge=binary
+#*.csproj merge=binary
+#*.vbproj merge=binary
+#*.vcxproj merge=binary
+#*.vcproj merge=binary
+#*.dbproj merge=binary
+#*.fsproj merge=binary
+#*.lsproj merge=binary
+#*.wixproj merge=binary
+#*.modelproj merge=binary
+#*.sqlproj merge=binary
+#*.wwaproj merge=binary
+
+###############################################################################
+# behavior for image files
+#
+# image files are treated as binary by default.
+###############################################################################
+#*.jpg binary
+#*.png binary
+#*.gif binary
+
+###############################################################################
+# diff behavior for common document formats
+#
+# Convert binary document formats to text before diffing them. This feature
+# is only available from the command line. Turn it on by uncommenting the
+# entries below.
+###############################################################################
+#*.doc diff=astextplain
+#*.DOC diff=astextplain
+#*.docx diff=astextplain
+#*.DOCX diff=astextplain
+#*.dot diff=astextplain
+#*.DOT diff=astextplain
+#*.pdf diff=astextplain
+#*.PDF diff=astextplain
+#*.rtf diff=astextplain
+#*.RTF diff=astextplain
+*.h linguist-language=C
+*.c linguist-language=C
diff --git a/tools/gzinject/.gitignore b/tools/gzinject/.gitignore
new file mode 100644
index 000000000..9ed774981
--- /dev/null
+++ b/tools/gzinject/.gitignore
@@ -0,0 +1,279 @@
+## Ignore Visual Studio temporary files, build results, and
+## files generated by popular Visual Studio add-ons.
+
+# User-specific files
+*.suo
+*.user
+*.userosscache
+*.sln.docstates
+
+# User-specific files (MonoDevelop/Xamarin Studio)
+*.userprefs
+
+# Build results
+[Dd]ebug/
+[Dd]ebugPublic/
+[Rr]elease/
+[Rr]eleases/
+x64/
+x86/
+bld/
+[Bb]in/
+[Oo]bj/
+[Ll]og/
+
+# Visual Studio 2015 cache/options directory
+.vs/
+# Uncomment if you have tasks that create the project's static files in wwwroot
+#wwwroot/
+
+# MSTest test Results
+[Tt]est[Rr]esult*/
+[Bb]uild[Ll]og.*
+
+# NUNIT
+*.VisualState.xml
+TestResult.xml
+
+# Build Results of an ATL Project
+[Dd]ebugPS/
+[Rr]eleasePS/
+dlldata.c
+
+# DNX
+project.lock.json
+project.fragment.lock.json
+artifacts/
+
+*_i.c
+*_p.c
+*_i.h
+*.ilk
+*.meta
+*.obj
+*.pch
+*.pdb
+*.pgc
+*.pgd
+*.rsp
+*.sbr
+*.tlb
+*.tli
+*.tlh
+*.tmp
+*.tmp_proj
+*.log
+*.vspscc
+*.vssscc
+.builds
+*.pidb
+*.svclog
+*.scc
+
+# Chutzpah Test files
+_Chutzpah*
+
+# Visual C++ cache files
+ipch/
+*.aps
+*.ncb
+*.opendb
+*.opensdf
+*.sdf
+*.cachefile
+*.VC.db
+*.VC.VC.opendb
+
+# Visual Studio profiler
+*.psess
+*.vsp
+*.vspx
+*.sap
+
+# TFS 2012 Local Workspace
+$tf/
+
+# Guidance Automation Toolkit
+*.gpState
+
+# ReSharper is a .NET coding add-in
+_ReSharper*/
+*.[Rr]e[Ss]harper
+*.DotSettings.user
+
+# JustCode is a .NET coding add-in
+.JustCode
+
+# TeamCity is a build add-in
+_TeamCity*
+
+# DotCover is a Code Coverage Tool
+*.dotCover
+
+# NCrunch
+_NCrunch_*
+.*crunch*.local.xml
+nCrunchTemp_*
+
+# MightyMoose
+*.mm.*
+AutoTest.Net/
+
+# Web workbench (sass)
+.sass-cache/
+
+# Installshield output folder
+[Ee]xpress/
+
+# DocProject is a documentation generator add-in
+DocProject/buildhelp/
+DocProject/Help/*.HxT
+DocProject/Help/*.HxC
+DocProject/Help/*.hhc
+DocProject/Help/*.hhk
+DocProject/Help/*.hhp
+DocProject/Help/Html2
+DocProject/Help/html
+
+# Click-Once directory
+publish/
+
+# Publish Web Output
+*.[Pp]ublish.xml
+*.azurePubxml
+# TODO: Comment the next line if you want to checkin your web deploy settings
+# but database connection strings (with potential passwords) will be unencrypted
+#*.pubxml
+*.publishproj
+
+# Microsoft Azure Web App publish settings. Comment the next line if you want to
+# checkin your Azure Web App publish settings, but sensitive information contained
+# in these scripts will be unencrypted
+PublishScripts/
+
+# NuGet Packages
+*.nupkg
+# The packages folder can be ignored because of Package Restore
+**/packages/*
+# except build/, which is used as an MSBuild target.
+!**/packages/build/
+# Uncomment if necessary however generally it will be regenerated when needed
+#!**/packages/repositories.config
+# NuGet v3's project.json files produces more ignoreable files
+*.nuget.props
+*.nuget.targets
+
+# Microsoft Azure Build Output
+csx/
+*.build.csdef
+
+# Microsoft Azure Emulator
+ecf/
+rcf/
+
+# Windows Store app package directories and files
+AppPackages/
+BundleArtifacts/
+Package.StoreAssociation.xml
+_pkginfo.txt
+
+# Visual Studio cache files
+# files ending in .cache can be ignored
+*.[Cc]ache
+# but keep track of directories ending in .cache
+!*.[Cc]ache/
+
+# Others
+ClientBin/
+~$*
+*~
+*.dbmdl
+*.dbproj.schemaview
+*.jfm
+*.pfx
+*.publishsettings
+node_modules/
+orleans.codegen.cs
+
+# Since there are multiple workflows, uncomment next line to ignore bower_components
+# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
+#bower_components/
+
+# RIA/Silverlight projects
+Generated_Code/
+
+# Backup & report files from converting an old project file
+# to a newer Visual Studio version. Backup files are not needed,
+# because we have git ;-)
+_UpgradeReport_Files/
+Backup*/
+UpgradeLog*.XML
+UpgradeLog*.htm
+
+# SQL Server files
+*.mdf
+*.ldf
+
+# Business Intelligence projects
+*.rdl.data
+*.bim.layout
+*.bim_*.settings
+
+# Microsoft Fakes
+FakesAssemblies/
+
+# GhostDoc plugin setting file
+*.GhostDoc.xml
+
+# Node.js Tools for Visual Studio
+.ntvs_analysis.dat
+
+# Visual Studio 6 build log
+*.plg
+
+# Visual Studio 6 workspace options file
+*.opt
+
+# Visual Studio LightSwitch build output
+**/*.HTMLClient/GeneratedArtifacts
+**/*.DesktopClient/GeneratedArtifacts
+**/*.DesktopClient/ModelManifest.xml
+**/*.Server/GeneratedArtifacts
+**/*.Server/ModelManifest.xml
+_Pvt_Extensions
+
+# Paket dependency manager
+.paket/paket.exe
+paket-files/
+
+# FAKE - F# Make
+.fake/
+
+# JetBrains Rider
+.idea/
+*.sln.iml
+
+# CodeRush
+.cr/
+
+# Python Tools for Visual Studio (PTVS)
+__pycache__/
+*.pyc
+
+# Testing File
+*.Wad
+TestExtract/
+*.exe
+*.stackdump
+/gzinject/Debug
+/gzinject.zip
+*.bin
+Debug/
+CppProperties.json
+wadextract/
+*.o
+Makefile
+config.*
+gzinject
+*.zip
+autom4te.cache/
\ No newline at end of file
diff --git a/tools/gzinject/.gitrepo b/tools/gzinject/.gitrepo
new file mode 100644
index 000000000..ccddd36d5
--- /dev/null
+++ b/tools/gzinject/.gitrepo
@@ -0,0 +1,12 @@
+; DO NOT EDIT (unless you know what you are doing)
+;
+; This subdirectory is a git "subrepo", and this file is maintained by the
+; git-subrepo command. See https://github.com/ingydotnet/git-subrepo#readme
+;
+[subrepo]
+ remote = https://github.com/krimtonz/gzinject.git
+ branch = master
+ commit = ee44efce5d842e5d4488ee47c16da8b673da5086
+ parent = 53941daac4bb1482a9125f3595642df1bafb5f6d
+ method = merge
+ cmdver = 0.4.5
diff --git a/tools/gzinject/BUILDING.md b/tools/gzinject/BUILDING.md
new file mode 100644
index 000000000..7fb64a1c3
--- /dev/null
+++ b/tools/gzinject/BUILDING.md
@@ -0,0 +1,8 @@
+## Prerequisites
+
+gcc, make
+
+## Building
+Run `./configure`. You can use `--prefix=DIR` to specify the installation directory. Then run `make` to build the executable, and `make install` to install gzinject to `DIR/bin`.
+
+By default gzinject will use the crypto library provided by OpenSSL. To disable this and use the builtin (slower) crypto functions, remove `-D_USE_LIBCRYPTO` from the makefile, and change `SRC = gzinject.c` to `SRC = *.c`.
diff --git a/tools/gzinject/LICENSE b/tools/gzinject/LICENSE
new file mode 100644
index 000000000..94a9ed024
--- /dev/null
+++ b/tools/gzinject/LICENSE
@@ -0,0 +1,674 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc.
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+ The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users. We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors. You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+ To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights. Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received. You must make sure that they, too, receive
+or can get the source code. And you must show them these terms so they
+know their rights.
+
+ Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+ For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software. For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+ Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so. This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software. The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable. Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products. If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+ Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary. To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ TERMS AND CONDITIONS
+
+ 0. Definitions.
+
+ "This License" refers to version 3 of the GNU General Public License.
+
+ "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+ "The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+ To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+ A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+ To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+ To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+ An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+ 1. Source Code.
+
+ The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+ A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+ The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+ The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+ The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+ The Corresponding Source for a work in source code form is that
+same work.
+
+ 2. Basic Permissions.
+
+ All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+ You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+ Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+ No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+ When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+ 4. Conveying Verbatim Copies.
+
+ You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+ You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+ 5. Conveying Modified Source Versions.
+
+ You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+ a) The work must carry prominent notices stating that you modified
+ it, and giving a relevant date.
+
+ b) The work must carry prominent notices stating that it is
+ released under this License and any conditions added under section
+ 7. This requirement modifies the requirement in section 4 to
+ "keep intact all notices".
+
+ c) You must license the entire work, as a whole, under this
+ License to anyone who comes into possession of a copy. This
+ License will therefore apply, along with any applicable section 7
+ additional terms, to the whole of the work, and all its parts,
+ regardless of how they are packaged. This License gives no
+ permission to license the work in any other way, but it does not
+ invalidate such permission if you have separately received it.
+
+ d) If the work has interactive user interfaces, each must display
+ Appropriate Legal Notices; however, if the Program has interactive
+ interfaces that do not display Appropriate Legal Notices, your
+ work need not make them do so.
+
+ A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+ 6. Conveying Non-Source Forms.
+
+ You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+ a) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by the
+ Corresponding Source fixed on a durable physical medium
+ customarily used for software interchange.
+
+ b) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by a
+ written offer, valid for at least three years and valid for as
+ long as you offer spare parts or customer support for that product
+ model, to give anyone who possesses the object code either (1) a
+ copy of the Corresponding Source for all the software in the
+ product that is covered by this License, on a durable physical
+ medium customarily used for software interchange, for a price no
+ more than your reasonable cost of physically performing this
+ conveying of source, or (2) access to copy the
+ Corresponding Source from a network server at no charge.
+
+ c) Convey individual copies of the object code with a copy of the
+ written offer to provide the Corresponding Source. This
+ alternative is allowed only occasionally and noncommercially, and
+ only if you received the object code with such an offer, in accord
+ with subsection 6b.
+
+ d) Convey the object code by offering access from a designated
+ place (gratis or for a charge), and offer equivalent access to the
+ Corresponding Source in the same way through the same place at no
+ further charge. You need not require recipients to copy the
+ Corresponding Source along with the object code. If the place to
+ copy the object code is a network server, the Corresponding Source
+ may be on a different server (operated by you or a third party)
+ that supports equivalent copying facilities, provided you maintain
+ clear directions next to the object code saying where to find the
+ Corresponding Source. Regardless of what server hosts the
+ Corresponding Source, you remain obligated to ensure that it is
+ available for as long as needed to satisfy these requirements.
+
+ e) Convey the object code using peer-to-peer transmission, provided
+ you inform other peers where the object code and Corresponding
+ Source of the work are being offered to the general public at no
+ charge under subsection 6d.
+
+ A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+ A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+ "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+ If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+ The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+ Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+ 7. Additional Terms.
+
+ "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+ When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+ Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+ a) Disclaiming warranty or limiting liability differently from the
+ terms of sections 15 and 16 of this License; or
+
+ b) Requiring preservation of specified reasonable legal notices or
+ author attributions in that material or in the Appropriate Legal
+ Notices displayed by works containing it; or
+
+ c) Prohibiting misrepresentation of the origin of that material, or
+ requiring that modified versions of such material be marked in
+ reasonable ways as different from the original version; or
+
+ d) Limiting the use for publicity purposes of names of licensors or
+ authors of the material; or
+
+ e) Declining to grant rights under trademark law for use of some
+ trade names, trademarks, or service marks; or
+
+ f) Requiring indemnification of licensors and authors of that
+ material by anyone who conveys the material (or modified versions of
+ it) with contractual assumptions of liability to the recipient, for
+ any liability that these contractual assumptions directly impose on
+ those licensors and authors.
+
+ All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+ If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+ Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+ 8. Termination.
+
+ You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+ However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+ Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+ Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+ 9. Acceptance Not Required for Having Copies.
+
+ You are not required to accept this License in order to receive or
+run a copy of the Program. Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance. However,
+nothing other than this License grants you permission to propagate or
+modify any covered work. These actions infringe copyright if you do
+not accept this License. Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+ 10. Automatic Licensing of Downstream Recipients.
+
+ Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+ An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+ You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+ 11. Patents.
+
+ A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The
+work thus licensed is called the contributor's "contributor version".
+
+ A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version. For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+ In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+ If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients. "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+ If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+ A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License. You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+ Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+ 12. No Surrender of Others' Freedom.
+
+ If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all. For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+ 13. Use with the GNU Affero General Public License.
+
+ Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work. The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+ 14. Revised Versions of this License.
+
+ The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation. If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+ If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+ Later license versions may give you additional or different
+permissions. However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+ 15. Disclaimer of Warranty.
+
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. Limitation of Liability.
+
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+ 17. Interpretation of Sections 15 and 16.
+
+ If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year>  <name of author>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+ If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+ <program>  Copyright (C) <year>  <name of author>
+ This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+ You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<https://www.gnu.org/licenses/>.
+
+ The GNU General Public License does not permit incorporating your program
+into proprietary programs. If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License. But first, please read
+<https://www.gnu.org/licenses/why-not-lgpl.html>.
diff --git a/tools/gzinject/Makefile.in b/tools/gzinject/Makefile.in
new file mode 100644
index 000000000..92b57f7ca
--- /dev/null
+++ b/tools/gzinject/Makefile.in
@@ -0,0 +1,39 @@
+CC = @CC@
+LD = @CC@
+INSTALL = @INSTALL@
+CFLAGS = -Wall -Wno-unused-result @CFLAGS@
+CPPFLAGS = @CPPFLAGS@
+prefix = @prefix@
+exec_prefix = @exec_prefix@
+bindir = @bindir@
+PROGNAME = gzinject
+LDFLAGS = -s @LDFLAGS@
+CFILES = *.c
+SRCDIR = src
+CSRC := $(foreach s,$(CFILES),$(wildcard $(SRCDIR)/$(s)))
+COBJ = $(patsubst $(SRCDIR)/%,$(OBJDIR)/%.o,$(CSRC))
+LIBS =
+OBJDIR = obj
+OUTDIR = $(OBJDIR)
+
+.PHONY : all install clean distclean
+
+all : $(PROGNAME)
+
+clean :
+ rm -rf $(PROGNAME) obj
+
+distclean : clean
+ rm -f Makefile
+
+install : $(PROGNAME)
+ $(INSTALL) -p -D --target-directory=$(DESTDIR)$(bindir) $(PROGNAME)
+
+$(PROGNAME) : $(COBJ)
+ $(LD) $(LDFLAGS) $^ -o $@ $(LIBS)
+
+$(OUTDIR) :
+ mkdir -p $@
+
+$(COBJ) : $(OBJDIR)/%.o: $(SRCDIR)/% | $(OBJDIR)
+ $(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@
diff --git a/tools/gzinject/README.md b/tools/gzinject/README.md
new file mode 100644
index 000000000..a3893a7d2
--- /dev/null
+++ b/tools/gzinject/README.md
@@ -0,0 +1,74 @@
+## About
+
+gzinject is a wad editing utility, primarily used for patching N64 VC Emulators, and replacing the rom inside. gzinject uses patch files to patch content files within the wad. A description of the patch file format can be seen in the [Patch](#patch) section.
+
+## Executable
+
+To build your own, run ./configure, then make, and make install. See BUILDING for more instructions
+
+Prebuilt Windows executable is contained under releases (https://github.com/krimtonz/gzinject/releases/latest)
+
+## Usage
+```
+Usage:
+ gzinject -a extract -w SOURCEWAD [options]
+ gzinject -a pack -w DESTWAD [options]
+ gzinject -a inject -w SOURCEWAD -m ROM [options]
+ gzinject -a genkey [options]
+ gzinject --help
+ gzinject --version
+
+Actions:
+ extract extracts SOURCEWAD to directory
+ pack packs directory into DESTWAD
+ inject injects rom into SOURCEWAD
+ genkey generates wii common-key
+
+Options:
+ -i, --channelid=ID New Channel ID For Pack and Inject actions (default: none)
+ -t, --title=title New Channel name for pack and inject actions (default: none)
+ -h, --help Prints this help message
+ -k, --key=keyfile Location of the common-key file (default: common-key.bin)
+ -r, --region=1-3 Region to use (default: 3)
+ --verbose Print out verbose program execution information
+ -d, --directory=directory Directory to extract contents to, or directory to read contents from (default: wadextract)
+ --cleanup Remove files before performing actions
+ --version Prints the current version
+ -m, --rom=rom Rom to inject for inject action (default: none)
+ -o, --outputwad=outwad The output wad for inject actions (default: SOURCEWAD-inject.wad)
+ -p, --patch-file=patchfile gzi file to use for applying patches (default: none)
+ -c, --content=contentfile the primary content file (default: 5)
+ --dol-inject Binary data to inject into the emulator program, requires --dol-loading
+ --dol-loading The loading address for the binary specified by --dol-inject
+ --dol-after After which patch file to inject the dol, default: after all patches
+```
+
+## Patch
+gzi files are text files with a command on each line. A # starting the line indicates a comment.
+
+line format:
+ccss oooooooo dddddddd\
+Where c indicates the command, s indicates the data size, o indicates the offset into the current file, and d indicates the data to replace with.
+
+```
+Commands:
+ 00: Begin using content file specified by d, offset and size are not used for this command
+ 01: lz77 decompress the current content file. offset, size, and data are not used for this command
+ 02: lz77 compress the current content file. offset, size, and data are not used for this command
+ 03: apply patch to currently selected file. If offset is higher than the file sizes, or a current file has not been selected, the patch is not applied
+
+Sizes:
+ 01: a one byte value. data & 0x000000FF is applied to content + offset
+ 02: a two byte value. data & 0x0000FFFF is applied to content + offset
+ 04: a four byte value. data is applied to content + offset
+```
+
+
+## Thanks/Authors
+
+gzinject was primarily written by me.\
+Thanks to glankk (https://github.com/glankk) for providing memory/controller fixes for OOT as well as debugging, testing, and providing fixes for various errors\
+The general workflow of extracting/packing the wad was taken from showmiiwads (https://github.com/dnasdw/showmiiwads/)\
+AES encryption/decryption was taken from kokke (https://github.com/kokke/tiny-AES-c)\
+SHA1 taken from clibs (https://github.com/clibs/sha1)\
+MD5 taken from Alexander Peslyak http://openwall.info/wiki/people/solar/software/public-domain-source-code/md5
diff --git a/tools/gzinject/configure b/tools/gzinject/configure
new file mode 100644
index 000000000..b9f5c308f
--- /dev/null
+++ b/tools/gzinject/configure
@@ -0,0 +1,4468 @@
+#! /bin/sh
+# Guess values for system-dependent variables and create Makefiles.
+# Generated by GNU Autoconf 2.69.
+#
+#
+# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
+#
+#
+# This configure script is free software; the Free Software Foundation
+# gives unlimited permission to copy, distribute and modify it.
+## -------------------- ##
+## M4sh Initialization. ##
+## -------------------- ##
+
+# Be more Bourne compatible
+DUALCASE=1; export DUALCASE # for MKS sh
+if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then :
+ emulate sh
+ NULLCMD=:
+ # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which
+ # is contrary to our usage. Disable this feature.
+ alias -g '${1+"$@"}'='"$@"'
+ setopt NO_GLOB_SUBST
+else
+ case `(set -o) 2>/dev/null` in #(
+ *posix*) :
+ set -o posix ;; #(
+ *) :
+ ;;
+esac
+fi
+
+
+as_nl='
+'
+export as_nl
+# Printing a long string crashes Solaris 7 /usr/bin/printf.
+as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'
+as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo
+as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo
+# Prefer a ksh shell builtin over an external printf program on Solaris,
+# but without wasting forks for bash or zsh.
+if test -z "$BASH_VERSION$ZSH_VERSION" \
+ && (test "X`print -r -- $as_echo`" = "X$as_echo") 2>/dev/null; then
+ as_echo='print -r --'
+ as_echo_n='print -rn --'
+elif (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then
+ as_echo='printf %s\n'
+ as_echo_n='printf %s'
+else
+ if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then
+ as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"'
+ as_echo_n='/usr/ucb/echo -n'
+ else
+ as_echo_body='eval expr "X$1" : "X\\(.*\\)"'
+ as_echo_n_body='eval
+ arg=$1;
+ case $arg in #(
+ *"$as_nl"*)
+ expr "X$arg" : "X\\(.*\\)$as_nl";
+ arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;;
+ esac;
+ expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl"
+ '
+ export as_echo_n_body
+ as_echo_n='sh -c $as_echo_n_body as_echo'
+ fi
+ export as_echo_body
+ as_echo='sh -c $as_echo_body as_echo'
+fi
+
+# The user is always right.
+if test "${PATH_SEPARATOR+set}" != set; then
+ PATH_SEPARATOR=:
+ (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && {
+ (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 ||
+ PATH_SEPARATOR=';'
+ }
+fi
+
+
+# IFS
+# We need space, tab and new line, in precisely that order. Quoting is
+# there to prevent editors from complaining about space-tab.
+# (If _AS_PATH_WALK were called with IFS unset, it would disable word
+# splitting by setting IFS to empty value.)
+IFS=" "" $as_nl"
+
+# Find who we are. Look in the path if we contain no directory separator.
+as_myself=
+case $0 in #((
+ *[\\/]* ) as_myself=$0 ;;
+ *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break
+ done
+IFS=$as_save_IFS
+
+ ;;
+esac
+# We did not find ourselves, most probably we were run as `sh COMMAND'
+# in which case we are not to be found in the path.
+if test "x$as_myself" = x; then
+ as_myself=$0
+fi
+if test ! -f "$as_myself"; then
+ $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2
+ exit 1
+fi
+
+# Unset variables that we do not need and which cause bugs (e.g. in
+# pre-3.0 UWIN ksh). But do not cause bugs in bash 2.01; the "|| exit 1"
+# suppresses any "Segmentation fault" message there. '((' could
+# trigger a bug in pdksh 5.2.14.
+for as_var in BASH_ENV ENV MAIL MAILPATH
+do eval test x\${$as_var+set} = xset \
+ && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || :
+done
+PS1='$ '
+PS2='> '
+PS4='+ '
+
+# NLS nuisances.
+LC_ALL=C
+export LC_ALL
+LANGUAGE=C
+export LANGUAGE
+
+# CDPATH.
+(unset CDPATH) >/dev/null 2>&1 && unset CDPATH
+
+# Use a proper internal environment variable to ensure we don't fall
+ # into an infinite loop, continuously re-executing ourselves.
+ if test x"${_as_can_reexec}" != xno && test "x$CONFIG_SHELL" != x; then
+ _as_can_reexec=no; export _as_can_reexec;
+ # We cannot yet assume a decent shell, so we have to provide a
+# neutralization value for shells without unset; and this also
+# works around shells that cannot unset nonexistent variables.
+# Preserve -v and -x to the replacement shell.
+BASH_ENV=/dev/null
+ENV=/dev/null
+(unset BASH_ENV) >/dev/null 2>&1 && unset BASH_ENV ENV
+case $- in # ((((
+ *v*x* | *x*v* ) as_opts=-vx ;;
+ *v* ) as_opts=-v ;;
+ *x* ) as_opts=-x ;;
+ * ) as_opts= ;;
+esac
+exec $CONFIG_SHELL $as_opts "$as_myself" ${1+"$@"}
+# Admittedly, this is quite paranoid, since all the known shells bail
+# out after a failed `exec'.
+$as_echo "$0: could not re-execute with $CONFIG_SHELL" >&2
+as_fn_exit 255
+ fi
+ # We don't want this to propagate to other subprocesses.
+ { _as_can_reexec=; unset _as_can_reexec;}
+if test "x$CONFIG_SHELL" = x; then
+ as_bourne_compatible="if test -n \"\${ZSH_VERSION+set}\" && (emulate sh) >/dev/null 2>&1; then :
+ emulate sh
+ NULLCMD=:
+ # Pre-4.2 versions of Zsh do word splitting on \${1+\"\$@\"}, which
+ # is contrary to our usage. Disable this feature.
+ alias -g '\${1+\"\$@\"}'='\"\$@\"'
+ setopt NO_GLOB_SUBST
+else
+ case \`(set -o) 2>/dev/null\` in #(
+ *posix*) :
+ set -o posix ;; #(
+ *) :
+ ;;
+esac
+fi
+"
+ as_required="as_fn_return () { (exit \$1); }
+as_fn_success () { as_fn_return 0; }
+as_fn_failure () { as_fn_return 1; }
+as_fn_ret_success () { return 0; }
+as_fn_ret_failure () { return 1; }
+
+exitcode=0
+as_fn_success || { exitcode=1; echo as_fn_success failed.; }
+as_fn_failure && { exitcode=1; echo as_fn_failure succeeded.; }
+as_fn_ret_success || { exitcode=1; echo as_fn_ret_success failed.; }
+as_fn_ret_failure && { exitcode=1; echo as_fn_ret_failure succeeded.; }
+if ( set x; as_fn_ret_success y && test x = \"\$1\" ); then :
+
+else
+ exitcode=1; echo positional parameters were not saved.
+fi
+test x\$exitcode = x0 || exit 1
+test -x / || exit 1"
+ as_suggested=" as_lineno_1=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_1a=\$LINENO
+ as_lineno_2=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_2a=\$LINENO
+ eval 'test \"x\$as_lineno_1'\$as_run'\" != \"x\$as_lineno_2'\$as_run'\" &&
+ test \"x\`expr \$as_lineno_1'\$as_run' + 1\`\" = \"x\$as_lineno_2'\$as_run'\"' || exit 1
+test \$(( 1 + 1 )) = 2 || exit 1"
+ if (eval "$as_required") 2>/dev/null; then :
+ as_have_required=yes
+else
+ as_have_required=no
+fi
+ if test x$as_have_required = xyes && (eval "$as_suggested") 2>/dev/null; then :
+
+else
+ as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+as_found=false
+for as_dir in /bin$PATH_SEPARATOR/usr/bin$PATH_SEPARATOR$PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ as_found=:
+ case $as_dir in #(
+ /*)
+ for as_base in sh bash ksh sh5; do
+ # Try only shells that exist, to save several forks.
+ as_shell=$as_dir/$as_base
+ if { test -f "$as_shell" || test -f "$as_shell.exe"; } &&
+ { $as_echo "$as_bourne_compatible""$as_required" | as_run=a "$as_shell"; } 2>/dev/null; then :
+ CONFIG_SHELL=$as_shell as_have_required=yes
+ if { $as_echo "$as_bourne_compatible""$as_suggested" | as_run=a "$as_shell"; } 2>/dev/null; then :
+ break 2
+fi
+fi
+ done;;
+ esac
+ as_found=false
+done
+$as_found || { if { test -f "$SHELL" || test -f "$SHELL.exe"; } &&
+ { $as_echo "$as_bourne_compatible""$as_required" | as_run=a "$SHELL"; } 2>/dev/null; then :
+ CONFIG_SHELL=$SHELL as_have_required=yes
+fi; }
+IFS=$as_save_IFS
+
+
+ if test "x$CONFIG_SHELL" != x; then :
+ export CONFIG_SHELL
+ # We cannot yet assume a decent shell, so we have to provide a
+# neutralization value for shells without unset; and this also
+# works around shells that cannot unset nonexistent variables.
+# Preserve -v and -x to the replacement shell.
+BASH_ENV=/dev/null
+ENV=/dev/null
+(unset BASH_ENV) >/dev/null 2>&1 && unset BASH_ENV ENV
+case $- in # ((((
+ *v*x* | *x*v* ) as_opts=-vx ;;
+ *v* ) as_opts=-v ;;
+ *x* ) as_opts=-x ;;
+ * ) as_opts= ;;
+esac
+exec $CONFIG_SHELL $as_opts "$as_myself" ${1+"$@"}
+# Admittedly, this is quite paranoid, since all the known shells bail
+# out after a failed `exec'.
+$as_echo "$0: could not re-execute with $CONFIG_SHELL" >&2
+exit 255
+fi
+
+ if test x$as_have_required = xno; then :
+ $as_echo "$0: This script requires a shell more modern than all"
+ $as_echo "$0: the shells that I found on your system."
+ if test x${ZSH_VERSION+set} = xset ; then
+ $as_echo "$0: In particular, zsh $ZSH_VERSION has bugs and should"
+ $as_echo "$0: be upgraded to zsh 4.3.4 or later."
+ else
+ $as_echo "$0: Please tell bug-autoconf@gnu.org about your system,
+$0: including any error possibly output before this
+$0: message. Then install a modern shell, or manually run
+$0: the script under such a shell if you do have one."
+ fi
+ exit 1
+fi
+fi
+fi
+SHELL=${CONFIG_SHELL-/bin/sh}
+export SHELL
+# Unset more variables known to interfere with behavior of common tools.
+CLICOLOR_FORCE= GREP_OPTIONS=
+unset CLICOLOR_FORCE GREP_OPTIONS
+
+## --------------------- ##
+## M4sh Shell Functions. ##
+## --------------------- ##
+# as_fn_unset VAR
+# ---------------
+# Portably unset VAR.
+as_fn_unset ()
+{
+ { eval $1=; unset $1;}
+}
+as_unset=as_fn_unset
+
+# as_fn_set_status STATUS
+# -----------------------
+# Set $? to STATUS, without forking.
+as_fn_set_status ()
+{
+ return $1
+} # as_fn_set_status
+
+# as_fn_exit STATUS
+# -----------------
+# Exit the shell with STATUS, even in a "trap 0" or "set -e" context.
+as_fn_exit ()
+{
+ set +e
+ as_fn_set_status $1
+ exit $1
+} # as_fn_exit
+
+# as_fn_mkdir_p
+# -------------
+# Create "$as_dir" as a directory, including parents if necessary.
+as_fn_mkdir_p ()
+{
+
+ case $as_dir in #(
+ -*) as_dir=./$as_dir;;
+ esac
+ test -d "$as_dir" || eval $as_mkdir_p || {
+ as_dirs=
+ while :; do
+ case $as_dir in #(
+ *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'(
+ *) as_qdir=$as_dir;;
+ esac
+ as_dirs="'$as_qdir' $as_dirs"
+ as_dir=`$as_dirname -- "$as_dir" ||
+$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+ X"$as_dir" : 'X\(//\)[^/]' \| \
+ X"$as_dir" : 'X\(//\)$' \| \
+ X"$as_dir" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X"$as_dir" |
+ sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+ s//\1/
+ q
+ }
+ /^X\(\/\/\)[^/].*/{
+ s//\1/
+ q
+ }
+ /^X\(\/\/\)$/{
+ s//\1/
+ q
+ }
+ /^X\(\/\).*/{
+ s//\1/
+ q
+ }
+ s/.*/./; q'`
+ test -d "$as_dir" && break
+ done
+ test -z "$as_dirs" || eval "mkdir $as_dirs"
+ } || test -d "$as_dir" || as_fn_error $? "cannot create directory $as_dir"
+
+
+} # as_fn_mkdir_p
+
+# as_fn_executable_p FILE
+# -----------------------
+# Test if FILE is an executable regular file.
+as_fn_executable_p ()
+{
+ test -f "$1" && test -x "$1"
+} # as_fn_executable_p
+# as_fn_append VAR VALUE
+# ----------------------
+# Append the text in VALUE to the end of the definition contained in VAR. Take
+# advantage of any shell optimizations that allow amortized linear growth over
+# repeated appends, instead of the typical quadratic growth present in naive
+# implementations.
+if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null; then :
+ eval 'as_fn_append ()
+ {
+ eval $1+=\$2
+ }'
+else
+ as_fn_append ()
+ {
+ eval $1=\$$1\$2
+ }
+fi # as_fn_append
+
+# as_fn_arith ARG...
+# ------------------
+# Perform arithmetic evaluation on the ARGs, and store the result in the
+# global $as_val. Take advantage of shells that can avoid forks. The arguments
+# must be portable across $(()) and expr.
+if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then :
+ eval 'as_fn_arith ()
+ {
+ as_val=$(( $* ))
+ }'
+else
+ as_fn_arith ()
+ {
+ as_val=`expr "$@" || test $? -eq 1`
+ }
+fi # as_fn_arith
+
+
+# as_fn_error STATUS ERROR [LINENO LOG_FD]
+# ----------------------------------------
+# Output "`basename $0`: error: ERROR" to stderr. If LINENO and LOG_FD are
+# provided, also output the error to LOG_FD, referencing LINENO. Then exit the
+# script with STATUS, using 1 if that was 0.
+as_fn_error ()
+{
+ as_status=$1; test $as_status -eq 0 && as_status=1
+ if test "$4"; then
+ as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+ $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4
+ fi
+ $as_echo "$as_me: error: $2" >&2
+ as_fn_exit $as_status
+} # as_fn_error
+
+if expr a : '\(a\)' >/dev/null 2>&1 &&
+ test "X`expr 00001 : '.*\(...\)'`" = X001; then
+ as_expr=expr
+else
+ as_expr=false
+fi
+
+if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then
+ as_basename=basename
+else
+ as_basename=false
+fi
+
+if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then
+ as_dirname=dirname
+else
+ as_dirname=false
+fi
+
+as_me=`$as_basename -- "$0" ||
+$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \
+ X"$0" : 'X\(//\)$' \| \
+ X"$0" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X/"$0" |
+ sed '/^.*\/\([^/][^/]*\)\/*$/{
+ s//\1/
+ q
+ }
+ /^X\/\(\/\/\)$/{
+ s//\1/
+ q
+ }
+ /^X\/\(\/\).*/{
+ s//\1/
+ q
+ }
+ s/.*/./; q'`
+
+# Avoid depending upon Character Ranges.
+as_cr_letters='abcdefghijklmnopqrstuvwxyz'
+as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+as_cr_Letters=$as_cr_letters$as_cr_LETTERS
+as_cr_digits='0123456789'
+as_cr_alnum=$as_cr_Letters$as_cr_digits
+
+
+ as_lineno_1=$LINENO as_lineno_1a=$LINENO
+ as_lineno_2=$LINENO as_lineno_2a=$LINENO
+ eval 'test "x$as_lineno_1'$as_run'" != "x$as_lineno_2'$as_run'" &&
+ test "x`expr $as_lineno_1'$as_run' + 1`" = "x$as_lineno_2'$as_run'"' || {
+ # Blame Lee E. McMahon (1931-1989) for sed's syntax. :-)
+ sed -n '
+ p
+ /[$]LINENO/=
+ ' <$as_myself |
+ sed '
+ s/[$]LINENO.*/&-/
+ t lineno
+ b
+ :lineno
+ N
+ :loop
+ s/[$]LINENO\([^'$as_cr_alnum'_].*\n\)\(.*\)/\2\1\2/
+ t loop
+ s/-\n.*//
+ ' >$as_me.lineno &&
+ chmod +x "$as_me.lineno" ||
+ { $as_echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2; as_fn_exit 1; }
+
+ # If we had to re-execute with $CONFIG_SHELL, we're ensured to have
+ # already done that, so ensure we don't try to do so again and fall
+ # in an infinite loop. This has already happened in practice.
+ _as_can_reexec=no; export _as_can_reexec
+ # Don't try to exec as it changes $[0], causing all sort of problems
+ # (the dirname of $[0] is not the place where we might find the
+ # original and so on. Autoconf is especially sensitive to this).
+ . "./$as_me.lineno"
+ # Exit status is that of the last command.
+ exit
+}
+
+ECHO_C= ECHO_N= ECHO_T=
+case `echo -n x` in #(((((
+-n*)
+ case `echo 'xy\c'` in
+ *c*) ECHO_T=' ';; # ECHO_T is single tab character.
+ xy) ECHO_C='\c';;
+ *) echo `echo ksh88 bug on AIX 6.1` > /dev/null
+ ECHO_T=' ';;
+ esac;;
+*)
+ ECHO_N='-n';;
+esac
+
+rm -f conf$$ conf$$.exe conf$$.file
+if test -d conf$$.dir; then
+ rm -f conf$$.dir/conf$$.file
+else
+ rm -f conf$$.dir
+ mkdir conf$$.dir 2>/dev/null
+fi
+if (echo >conf$$.file) 2>/dev/null; then
+ if ln -s conf$$.file conf$$ 2>/dev/null; then
+ as_ln_s='ln -s'
+ # ... but there are two gotchas:
+ # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail.
+ # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable.
+ # In both cases, we have to default to `cp -pR'.
+ ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe ||
+ as_ln_s='cp -pR'
+ elif ln conf$$.file conf$$ 2>/dev/null; then
+ as_ln_s=ln
+ else
+ as_ln_s='cp -pR'
+ fi
+else
+ as_ln_s='cp -pR'
+fi
+rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file
+rmdir conf$$.dir 2>/dev/null
+
+if mkdir -p . 2>/dev/null; then
+ as_mkdir_p='mkdir -p "$as_dir"'
+else
+ test -d ./-p && rmdir ./-p
+ as_mkdir_p=false
+fi
+
+as_test_x='test -x'
+as_executable_p=as_fn_executable_p
+
+# Sed expression to map a string onto a valid CPP name.
+as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'"
+
+# Sed expression to map a string onto a valid variable name.
+as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'"
+
+
+test -n "$DJDIR" || exec 7<&0 &1
+
+# Name of the host.
+# hostname on some systems (SVR3.2, old GNU/Linux) returns a bogus exit status,
+# so uname gets run too.
+ac_hostname=`(hostname || uname -n) 2>/dev/null | sed 1q`
+
+#
+# Initializations.
+#
+ac_default_prefix=/usr/local
+ac_clean_files=
+ac_config_libobj_dir=.
+LIBOBJS=
+cross_compiling=no
+subdirs=
+MFLAGS=
+MAKEFLAGS=
+
+# Identity of this package.
+PACKAGE_NAME=
+PACKAGE_TARNAME=
+PACKAGE_VERSION=
+PACKAGE_STRING=
+PACKAGE_BUGREPORT=
+PACKAGE_URL=
+
+ac_default_prefix=/usr/local
+# Factoring default headers for most tests.
+ac_includes_default="\
+#include
+#ifdef HAVE_SYS_TYPES_H
+# include
+#endif
+#ifdef HAVE_SYS_STAT_H
+# include
+#endif
+#ifdef STDC_HEADERS
+# include
+# include
+#else
+# ifdef HAVE_STDLIB_H
+# include
+# endif
+#endif
+#ifdef HAVE_STRING_H
+# if !defined STDC_HEADERS && defined HAVE_MEMORY_H
+# include
+# endif
+# include
+#endif
+#ifdef HAVE_STRINGS_H
+# include
+#endif
+#ifdef HAVE_INTTYPES_H
+# include
+#endif
+#ifdef HAVE_STDINT_H
+# include
+#endif
+#ifdef HAVE_UNISTD_H
+# include
+#endif"
+
+ac_subst_vars='LTLIBOBJS
+LIBOBJS
+EGREP
+GREP
+CPP
+INSTALL_DATA
+INSTALL_SCRIPT
+INSTALL_PROGRAM
+OBJEXT
+EXEEXT
+ac_ct_CC
+CPPFLAGS
+LDFLAGS
+CFLAGS
+CC
+target_alias
+host_alias
+build_alias
+LIBS
+ECHO_T
+ECHO_N
+ECHO_C
+DEFS
+mandir
+localedir
+libdir
+psdir
+pdfdir
+dvidir
+htmldir
+infodir
+docdir
+oldincludedir
+includedir
+runstatedir
+localstatedir
+sharedstatedir
+sysconfdir
+datadir
+datarootdir
+libexecdir
+sbindir
+bindir
+program_transform_name
+prefix
+exec_prefix
+PACKAGE_URL
+PACKAGE_BUGREPORT
+PACKAGE_STRING
+PACKAGE_VERSION
+PACKAGE_TARNAME
+PACKAGE_NAME
+PATH_SEPARATOR
+SHELL'
+ac_subst_files=''
+ac_user_opts='
+enable_option_checking
+'
+ ac_precious_vars='build_alias
+host_alias
+target_alias
+CC
+CFLAGS
+LDFLAGS
+LIBS
+CPPFLAGS
+CPP'
+
+
+# Initialize some variables set by options.
+ac_init_help=
+ac_init_version=false
+ac_unrecognized_opts=
+ac_unrecognized_sep=
+# The variables have the same names as the options, with
+# dashes changed to underlines.
+cache_file=/dev/null
+exec_prefix=NONE
+no_create=
+no_recursion=
+prefix=NONE
+program_prefix=NONE
+program_suffix=NONE
+program_transform_name=s,x,x,
+silent=
+site=
+srcdir=
+verbose=
+x_includes=NONE
+x_libraries=NONE
+
+# Installation directory options.
+# These are left unexpanded so users can "make install exec_prefix=/foo"
+# and all the variables that are supposed to be based on exec_prefix
+# by default will actually change.
+# Use braces instead of parens because sh, perl, etc. also accept them.
+# (The list follows the same order as the GNU Coding Standards.)
+bindir='${exec_prefix}/bin'
+sbindir='${exec_prefix}/sbin'
+libexecdir='${exec_prefix}/libexec'
+datarootdir='${prefix}/share'
+datadir='${datarootdir}'
+sysconfdir='${prefix}/etc'
+sharedstatedir='${prefix}/com'
+localstatedir='${prefix}/var'
+runstatedir='${localstatedir}/run'
+includedir='${prefix}/include'
+oldincludedir='/usr/include'
+docdir='${datarootdir}/doc/${PACKAGE}'
+infodir='${datarootdir}/info'
+htmldir='${docdir}'
+dvidir='${docdir}'
+pdfdir='${docdir}'
+psdir='${docdir}'
+libdir='${exec_prefix}/lib'
+localedir='${datarootdir}/locale'
+mandir='${datarootdir}/man'
+
+ac_prev=
+ac_dashdash=
+for ac_option
+do
+ # If the previous option needs an argument, assign it.
+ if test -n "$ac_prev"; then
+ eval $ac_prev=\$ac_option
+ ac_prev=
+ continue
+ fi
+
+ case $ac_option in
+ *=?*) ac_optarg=`expr "X$ac_option" : '[^=]*=\(.*\)'` ;;
+ *=) ac_optarg= ;;
+ *) ac_optarg=yes ;;
+ esac
+
+ # Accept the important Cygnus configure options, so we can diagnose typos.
+
+ case $ac_dashdash$ac_option in
+ --)
+ ac_dashdash=yes ;;
+
+ -bindir | --bindir | --bindi | --bind | --bin | --bi)
+ ac_prev=bindir ;;
+ -bindir=* | --bindir=* | --bindi=* | --bind=* | --bin=* | --bi=*)
+ bindir=$ac_optarg ;;
+
+ -build | --build | --buil | --bui | --bu)
+ ac_prev=build_alias ;;
+ -build=* | --build=* | --buil=* | --bui=* | --bu=*)
+ build_alias=$ac_optarg ;;
+
+ -cache-file | --cache-file | --cache-fil | --cache-fi \
+ | --cache-f | --cache- | --cache | --cach | --cac | --ca | --c)
+ ac_prev=cache_file ;;
+ -cache-file=* | --cache-file=* | --cache-fil=* | --cache-fi=* \
+ | --cache-f=* | --cache-=* | --cache=* | --cach=* | --cac=* | --ca=* | --c=*)
+ cache_file=$ac_optarg ;;
+
+ --config-cache | -C)
+ cache_file=config.cache ;;
+
+ -datadir | --datadir | --datadi | --datad)
+ ac_prev=datadir ;;
+ -datadir=* | --datadir=* | --datadi=* | --datad=*)
+ datadir=$ac_optarg ;;
+
+ -datarootdir | --datarootdir | --datarootdi | --datarootd | --dataroot \
+ | --dataroo | --dataro | --datar)
+ ac_prev=datarootdir ;;
+ -datarootdir=* | --datarootdir=* | --datarootdi=* | --datarootd=* \
+ | --dataroot=* | --dataroo=* | --dataro=* | --datar=*)
+ datarootdir=$ac_optarg ;;
+
+ -disable-* | --disable-*)
+ ac_useropt=`expr "x$ac_option" : 'x-*disable-\(.*\)'`
+ # Reject names that are not valid shell variable names.
+ expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null &&
+ as_fn_error $? "invalid feature name: $ac_useropt"
+ ac_useropt_orig=$ac_useropt
+ ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'`
+ case $ac_user_opts in
+ *"
+"enable_$ac_useropt"
+"*) ;;
+ *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--disable-$ac_useropt_orig"
+ ac_unrecognized_sep=', ';;
+ esac
+ eval enable_$ac_useropt=no ;;
+
+ -docdir | --docdir | --docdi | --doc | --do)
+ ac_prev=docdir ;;
+ -docdir=* | --docdir=* | --docdi=* | --doc=* | --do=*)
+ docdir=$ac_optarg ;;
+
+ -dvidir | --dvidir | --dvidi | --dvid | --dvi | --dv)
+ ac_prev=dvidir ;;
+ -dvidir=* | --dvidir=* | --dvidi=* | --dvid=* | --dvi=* | --dv=*)
+ dvidir=$ac_optarg ;;
+
+ -enable-* | --enable-*)
+ ac_useropt=`expr "x$ac_option" : 'x-*enable-\([^=]*\)'`
+ # Reject names that are not valid shell variable names.
+ expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null &&
+ as_fn_error $? "invalid feature name: $ac_useropt"
+ ac_useropt_orig=$ac_useropt
+ ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'`
+ case $ac_user_opts in
+ *"
+"enable_$ac_useropt"
+"*) ;;
+ *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--enable-$ac_useropt_orig"
+ ac_unrecognized_sep=', ';;
+ esac
+ eval enable_$ac_useropt=\$ac_optarg ;;
+
+ -exec-prefix | --exec_prefix | --exec-prefix | --exec-prefi \
+ | --exec-pref | --exec-pre | --exec-pr | --exec-p | --exec- \
+ | --exec | --exe | --ex)
+ ac_prev=exec_prefix ;;
+ -exec-prefix=* | --exec_prefix=* | --exec-prefix=* | --exec-prefi=* \
+ | --exec-pref=* | --exec-pre=* | --exec-pr=* | --exec-p=* | --exec-=* \
+ | --exec=* | --exe=* | --ex=*)
+ exec_prefix=$ac_optarg ;;
+
+ -gas | --gas | --ga | --g)
+ # Obsolete; use --with-gas.
+ with_gas=yes ;;
+
+ -help | --help | --hel | --he | -h)
+ ac_init_help=long ;;
+ -help=r* | --help=r* | --hel=r* | --he=r* | -hr*)
+ ac_init_help=recursive ;;
+ -help=s* | --help=s* | --hel=s* | --he=s* | -hs*)
+ ac_init_help=short ;;
+
+ -host | --host | --hos | --ho)
+ ac_prev=host_alias ;;
+ -host=* | --host=* | --hos=* | --ho=*)
+ host_alias=$ac_optarg ;;
+
+ -htmldir | --htmldir | --htmldi | --htmld | --html | --htm | --ht)
+ ac_prev=htmldir ;;
+ -htmldir=* | --htmldir=* | --htmldi=* | --htmld=* | --html=* | --htm=* \
+ | --ht=*)
+ htmldir=$ac_optarg ;;
+
+ -includedir | --includedir | --includedi | --included | --include \
+ | --includ | --inclu | --incl | --inc)
+ ac_prev=includedir ;;
+ -includedir=* | --includedir=* | --includedi=* | --included=* | --include=* \
+ | --includ=* | --inclu=* | --incl=* | --inc=*)
+ includedir=$ac_optarg ;;
+
+ -infodir | --infodir | --infodi | --infod | --info | --inf)
+ ac_prev=infodir ;;
+ -infodir=* | --infodir=* | --infodi=* | --infod=* | --info=* | --inf=*)
+ infodir=$ac_optarg ;;
+
+ -libdir | --libdir | --libdi | --libd)
+ ac_prev=libdir ;;
+ -libdir=* | --libdir=* | --libdi=* | --libd=*)
+ libdir=$ac_optarg ;;
+
+ -libexecdir | --libexecdir | --libexecdi | --libexecd | --libexec \
+ | --libexe | --libex | --libe)
+ ac_prev=libexecdir ;;
+ -libexecdir=* | --libexecdir=* | --libexecdi=* | --libexecd=* | --libexec=* \
+ | --libexe=* | --libex=* | --libe=*)
+ libexecdir=$ac_optarg ;;
+
+ -localedir | --localedir | --localedi | --localed | --locale)
+ ac_prev=localedir ;;
+ -localedir=* | --localedir=* | --localedi=* | --localed=* | --locale=*)
+ localedir=$ac_optarg ;;
+
+ -localstatedir | --localstatedir | --localstatedi | --localstated \
+ | --localstate | --localstat | --localsta | --localst | --locals)
+ ac_prev=localstatedir ;;
+ -localstatedir=* | --localstatedir=* | --localstatedi=* | --localstated=* \
+ | --localstate=* | --localstat=* | --localsta=* | --localst=* | --locals=*)
+ localstatedir=$ac_optarg ;;
+
+ -mandir | --mandir | --mandi | --mand | --man | --ma | --m)
+ ac_prev=mandir ;;
+ -mandir=* | --mandir=* | --mandi=* | --mand=* | --man=* | --ma=* | --m=*)
+ mandir=$ac_optarg ;;
+
+ -nfp | --nfp | --nf)
+ # Obsolete; use --without-fp.
+ with_fp=no ;;
+
+ -no-create | --no-create | --no-creat | --no-crea | --no-cre \
+ | --no-cr | --no-c | -n)
+ no_create=yes ;;
+
+ -no-recursion | --no-recursion | --no-recursio | --no-recursi \
+ | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r)
+ no_recursion=yes ;;
+
+ -oldincludedir | --oldincludedir | --oldincludedi | --oldincluded \
+ | --oldinclude | --oldinclud | --oldinclu | --oldincl | --oldinc \
+ | --oldin | --oldi | --old | --ol | --o)
+ ac_prev=oldincludedir ;;
+ -oldincludedir=* | --oldincludedir=* | --oldincludedi=* | --oldincluded=* \
+ | --oldinclude=* | --oldinclud=* | --oldinclu=* | --oldincl=* | --oldinc=* \
+ | --oldin=* | --oldi=* | --old=* | --ol=* | --o=*)
+ oldincludedir=$ac_optarg ;;
+
+ -prefix | --prefix | --prefi | --pref | --pre | --pr | --p)
+ ac_prev=prefix ;;
+ -prefix=* | --prefix=* | --prefi=* | --pref=* | --pre=* | --pr=* | --p=*)
+ prefix=$ac_optarg ;;
+
+ -program-prefix | --program-prefix | --program-prefi | --program-pref \
+ | --program-pre | --program-pr | --program-p)
+ ac_prev=program_prefix ;;
+ -program-prefix=* | --program-prefix=* | --program-prefi=* \
+ | --program-pref=* | --program-pre=* | --program-pr=* | --program-p=*)
+ program_prefix=$ac_optarg ;;
+
+ -program-suffix | --program-suffix | --program-suffi | --program-suff \
+ | --program-suf | --program-su | --program-s)
+ ac_prev=program_suffix ;;
+ -program-suffix=* | --program-suffix=* | --program-suffi=* \
+ | --program-suff=* | --program-suf=* | --program-su=* | --program-s=*)
+ program_suffix=$ac_optarg ;;
+
+ -program-transform-name | --program-transform-name \
+ | --program-transform-nam | --program-transform-na \
+ | --program-transform-n | --program-transform- \
+ | --program-transform | --program-transfor \
+ | --program-transfo | --program-transf \
+ | --program-trans | --program-tran \
+ | --progr-tra | --program-tr | --program-t)
+ ac_prev=program_transform_name ;;
+ -program-transform-name=* | --program-transform-name=* \
+ | --program-transform-nam=* | --program-transform-na=* \
+ | --program-transform-n=* | --program-transform-=* \
+ | --program-transform=* | --program-transfor=* \
+ | --program-transfo=* | --program-transf=* \
+ | --program-trans=* | --program-tran=* \
+ | --progr-tra=* | --program-tr=* | --program-t=*)
+ program_transform_name=$ac_optarg ;;
+
+ -pdfdir | --pdfdir | --pdfdi | --pdfd | --pdf | --pd)
+ ac_prev=pdfdir ;;
+ -pdfdir=* | --pdfdir=* | --pdfdi=* | --pdfd=* | --pdf=* | --pd=*)
+ pdfdir=$ac_optarg ;;
+
+ -psdir | --psdir | --psdi | --psd | --ps)
+ ac_prev=psdir ;;
+ -psdir=* | --psdir=* | --psdi=* | --psd=* | --ps=*)
+ psdir=$ac_optarg ;;
+
+ -q | -quiet | --quiet | --quie | --qui | --qu | --q \
+ | -silent | --silent | --silen | --sile | --sil)
+ silent=yes ;;
+
+ -runstatedir | --runstatedir | --runstatedi | --runstated \
+ | --runstate | --runstat | --runsta | --runst | --runs \
+ | --run | --ru | --r)
+ ac_prev=runstatedir ;;
+ -runstatedir=* | --runstatedir=* | --runstatedi=* | --runstated=* \
+ | --runstate=* | --runstat=* | --runsta=* | --runst=* | --runs=* \
+ | --run=* | --ru=* | --r=*)
+ runstatedir=$ac_optarg ;;
+
+ -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb)
+ ac_prev=sbindir ;;
+ -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \
+ | --sbi=* | --sb=*)
+ sbindir=$ac_optarg ;;
+
+ -sharedstatedir | --sharedstatedir | --sharedstatedi \
+ | --sharedstated | --sharedstate | --sharedstat | --sharedsta \
+ | --sharedst | --shareds | --shared | --share | --shar \
+ | --sha | --sh)
+ ac_prev=sharedstatedir ;;
+ -sharedstatedir=* | --sharedstatedir=* | --sharedstatedi=* \
+ | --sharedstated=* | --sharedstate=* | --sharedstat=* | --sharedsta=* \
+ | --sharedst=* | --shareds=* | --shared=* | --share=* | --shar=* \
+ | --sha=* | --sh=*)
+ sharedstatedir=$ac_optarg ;;
+
+ -site | --site | --sit)
+ ac_prev=site ;;
+ -site=* | --site=* | --sit=*)
+ site=$ac_optarg ;;
+
+ -srcdir | --srcdir | --srcdi | --srcd | --src | --sr)
+ ac_prev=srcdir ;;
+ -srcdir=* | --srcdir=* | --srcdi=* | --srcd=* | --src=* | --sr=*)
+ srcdir=$ac_optarg ;;
+
+ -sysconfdir | --sysconfdir | --sysconfdi | --sysconfd | --sysconf \
+ | --syscon | --sysco | --sysc | --sys | --sy)
+ ac_prev=sysconfdir ;;
+ -sysconfdir=* | --sysconfdir=* | --sysconfdi=* | --sysconfd=* | --sysconf=* \
+ | --syscon=* | --sysco=* | --sysc=* | --sys=* | --sy=*)
+ sysconfdir=$ac_optarg ;;
+
+ -target | --target | --targe | --targ | --tar | --ta | --t)
+ ac_prev=target_alias ;;
+ -target=* | --target=* | --targe=* | --targ=* | --tar=* | --ta=* | --t=*)
+ target_alias=$ac_optarg ;;
+
+ -v | -verbose | --verbose | --verbos | --verbo | --verb)
+ verbose=yes ;;
+
+ -version | --version | --versio | --versi | --vers | -V)
+ ac_init_version=: ;;
+
+ -with-* | --with-*)
+ ac_useropt=`expr "x$ac_option" : 'x-*with-\([^=]*\)'`
+ # Reject names that are not valid shell variable names.
+ expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null &&
+ as_fn_error $? "invalid package name: $ac_useropt"
+ ac_useropt_orig=$ac_useropt
+ ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'`
+ case $ac_user_opts in
+ *"
+"with_$ac_useropt"
+"*) ;;
+ *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--with-$ac_useropt_orig"
+ ac_unrecognized_sep=', ';;
+ esac
+ eval with_$ac_useropt=\$ac_optarg ;;
+
+ -without-* | --without-*)
+ ac_useropt=`expr "x$ac_option" : 'x-*without-\(.*\)'`
+ # Reject names that are not valid shell variable names.
+ expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null &&
+ as_fn_error $? "invalid package name: $ac_useropt"
+ ac_useropt_orig=$ac_useropt
+ ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'`
+ case $ac_user_opts in
+ *"
+"with_$ac_useropt"
+"*) ;;
+ *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--without-$ac_useropt_orig"
+ ac_unrecognized_sep=', ';;
+ esac
+ eval with_$ac_useropt=no ;;
+
+ --x)
+ # Obsolete; use --with-x.
+ with_x=yes ;;
+
+ -x-includes | --x-includes | --x-include | --x-includ | --x-inclu \
+ | --x-incl | --x-inc | --x-in | --x-i)
+ ac_prev=x_includes ;;
+ -x-includes=* | --x-includes=* | --x-include=* | --x-includ=* | --x-inclu=* \
+ | --x-incl=* | --x-inc=* | --x-in=* | --x-i=*)
+ x_includes=$ac_optarg ;;
+
+ -x-libraries | --x-libraries | --x-librarie | --x-librari \
+ | --x-librar | --x-libra | --x-libr | --x-lib | --x-li | --x-l)
+ ac_prev=x_libraries ;;
+ -x-libraries=* | --x-libraries=* | --x-librarie=* | --x-librari=* \
+ | --x-librar=* | --x-libra=* | --x-libr=* | --x-lib=* | --x-li=* | --x-l=*)
+ x_libraries=$ac_optarg ;;
+
+ -*) as_fn_error $? "unrecognized option: \`$ac_option'
+Try \`$0 --help' for more information"
+ ;;
+
+ *=*)
+ ac_envvar=`expr "x$ac_option" : 'x\([^=]*\)='`
+ # Reject names that are not valid shell variable names.
+ case $ac_envvar in #(
+ '' | [0-9]* | *[!_$as_cr_alnum]* )
+ as_fn_error $? "invalid variable name: \`$ac_envvar'" ;;
+ esac
+ eval $ac_envvar=\$ac_optarg
+ export $ac_envvar ;;
+
+ *)
+ # FIXME: should be removed in autoconf 3.0.
+ $as_echo "$as_me: WARNING: you should use --build, --host, --target" >&2
+ expr "x$ac_option" : ".*[^-._$as_cr_alnum]" >/dev/null &&
+ $as_echo "$as_me: WARNING: invalid host type: $ac_option" >&2
+ : "${build_alias=$ac_option} ${host_alias=$ac_option} ${target_alias=$ac_option}"
+ ;;
+
+ esac
+done
+
+if test -n "$ac_prev"; then
+ ac_option=--`echo $ac_prev | sed 's/_/-/g'`
+ as_fn_error $? "missing argument to $ac_option"
+fi
+
+if test -n "$ac_unrecognized_opts"; then
+ case $enable_option_checking in
+ no) ;;
+ fatal) as_fn_error $? "unrecognized options: $ac_unrecognized_opts" ;;
+ *) $as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2 ;;
+ esac
+fi
+
+# Check all directory arguments for consistency.
+for ac_var in exec_prefix prefix bindir sbindir libexecdir datarootdir \
+ datadir sysconfdir sharedstatedir localstatedir includedir \
+ oldincludedir docdir infodir htmldir dvidir pdfdir psdir \
+ libdir localedir mandir runstatedir
+do
+ eval ac_val=\$$ac_var
+ # Remove trailing slashes.
+ case $ac_val in
+ */ )
+ ac_val=`expr "X$ac_val" : 'X\(.*[^/]\)' \| "X$ac_val" : 'X\(.*\)'`
+ eval $ac_var=\$ac_val;;
+ esac
+ # Be sure to have absolute directory names.
+ case $ac_val in
+ [\\/$]* | ?:[\\/]* ) continue;;
+ NONE | '' ) case $ac_var in *prefix ) continue;; esac;;
+ esac
+ as_fn_error $? "expected an absolute directory name for --$ac_var: $ac_val"
+done
+
+# There might be people who depend on the old broken behavior: `$host'
+# used to hold the argument of --host etc.
+# FIXME: To remove some day.
+build=$build_alias
+host=$host_alias
+target=$target_alias
+
+# FIXME: To remove some day.
+if test "x$host_alias" != x; then
+ if test "x$build_alias" = x; then
+ cross_compiling=maybe
+ elif test "x$build_alias" != "x$host_alias"; then
+ cross_compiling=yes
+ fi
+fi
+
+ac_tool_prefix=
+test -n "$host_alias" && ac_tool_prefix=$host_alias-
+
+test "$silent" = yes && exec 6>/dev/null
+
+
+ac_pwd=`pwd` && test -n "$ac_pwd" &&
+ac_ls_di=`ls -di .` &&
+ac_pwd_ls_di=`cd "$ac_pwd" && ls -di .` ||
+ as_fn_error $? "working directory cannot be determined"
+test "X$ac_ls_di" = "X$ac_pwd_ls_di" ||
+ as_fn_error $? "pwd does not report name of working directory"
+
+
+# Find the source files, if location was not specified.
+if test -z "$srcdir"; then
+ ac_srcdir_defaulted=yes
+ # Try the directory containing this script, then the parent directory.
+ ac_confdir=`$as_dirname -- "$as_myself" ||
+$as_expr X"$as_myself" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+ X"$as_myself" : 'X\(//\)[^/]' \| \
+ X"$as_myself" : 'X\(//\)$' \| \
+ X"$as_myself" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X"$as_myself" |
+ sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+ s//\1/
+ q
+ }
+ /^X\(\/\/\)[^/].*/{
+ s//\1/
+ q
+ }
+ /^X\(\/\/\)$/{
+ s//\1/
+ q
+ }
+ /^X\(\/\).*/{
+ s//\1/
+ q
+ }
+ s/.*/./; q'`
+ srcdir=$ac_confdir
+ if test ! -r "$srcdir/$ac_unique_file"; then
+ srcdir=..
+ fi
+else
+ ac_srcdir_defaulted=no
+fi
+if test ! -r "$srcdir/$ac_unique_file"; then
+ test "$ac_srcdir_defaulted" = yes && srcdir="$ac_confdir or .."
+ as_fn_error $? "cannot find sources ($ac_unique_file) in $srcdir"
+fi
+ac_msg="sources are in $srcdir, but \`cd $srcdir' does not work"
+ac_abs_confdir=`(
+ cd "$srcdir" && test -r "./$ac_unique_file" || as_fn_error $? "$ac_msg"
+ pwd)`
+# When building in place, set srcdir=.
+if test "$ac_abs_confdir" = "$ac_pwd"; then
+ srcdir=.
+fi
+# Remove unnecessary trailing slashes from srcdir.
+# Double slashes in file names in object file debugging info
+# mess up M-x gdb in Emacs.
+case $srcdir in
+*/) srcdir=`expr "X$srcdir" : 'X\(.*[^/]\)' \| "X$srcdir" : 'X\(.*\)'`;;
+esac
+for ac_var in $ac_precious_vars; do
+ eval ac_env_${ac_var}_set=\${${ac_var}+set}
+ eval ac_env_${ac_var}_value=\$${ac_var}
+ eval ac_cv_env_${ac_var}_set=\${${ac_var}+set}
+ eval ac_cv_env_${ac_var}_value=\$${ac_var}
+done
+
+#
+# Report the --help message.
+#
+if test "$ac_init_help" = "long"; then
+ # Omit some internal or obsolete options to make the list less imposing.
+ # This message is too long to be a string in the A/UX 3.1 sh.
+ cat <<_ACEOF
+\`configure' configures this package to adapt to many kinds of systems.
+
+Usage: $0 [OPTION]... [VAR=VALUE]...
+
+To assign environment variables (e.g., CC, CFLAGS...), specify them as
+VAR=VALUE. See below for descriptions of some of the useful variables.
+
+Defaults for the options are specified in brackets.
+
+Configuration:
+ -h, --help display this help and exit
+ --help=short display options specific to this package
+ --help=recursive display the short help of all the included packages
+ -V, --version display version information and exit
+ -q, --quiet, --silent do not print \`checking ...' messages
+ --cache-file=FILE cache test results in FILE [disabled]
+ -C, --config-cache alias for \`--cache-file=config.cache'
+ -n, --no-create do not create output files
+ --srcdir=DIR find the sources in DIR [configure dir or \`..']
+
+Installation directories:
+ --prefix=PREFIX install architecture-independent files in PREFIX
+ [$ac_default_prefix]
+ --exec-prefix=EPREFIX install architecture-dependent files in EPREFIX
+ [PREFIX]
+
+By default, \`make install' will install all the files in
+\`$ac_default_prefix/bin', \`$ac_default_prefix/lib' etc. You can specify
+an installation prefix other than \`$ac_default_prefix' using \`--prefix',
+for instance \`--prefix=\$HOME'.
+
+For better control, use the options below.
+
+Fine tuning of the installation directories:
+ --bindir=DIR user executables [EPREFIX/bin]
+ --sbindir=DIR system admin executables [EPREFIX/sbin]
+ --libexecdir=DIR program executables [EPREFIX/libexec]
+ --sysconfdir=DIR read-only single-machine data [PREFIX/etc]
+ --sharedstatedir=DIR modifiable architecture-independent data [PREFIX/com]
+ --localstatedir=DIR modifiable single-machine data [PREFIX/var]
+ --runstatedir=DIR modifiable per-process data [LOCALSTATEDIR/run]
+ --libdir=DIR object code libraries [EPREFIX/lib]
+ --includedir=DIR C header files [PREFIX/include]
+ --oldincludedir=DIR C header files for non-gcc [/usr/include]
+ --datarootdir=DIR read-only arch.-independent data root [PREFIX/share]
+ --datadir=DIR read-only architecture-independent data [DATAROOTDIR]
+ --infodir=DIR info documentation [DATAROOTDIR/info]
+ --localedir=DIR locale-dependent data [DATAROOTDIR/locale]
+ --mandir=DIR man documentation [DATAROOTDIR/man]
+ --docdir=DIR documentation root [DATAROOTDIR/doc/PACKAGE]
+ --htmldir=DIR html documentation [DOCDIR]
+ --dvidir=DIR dvi documentation [DOCDIR]
+ --pdfdir=DIR pdf documentation [DOCDIR]
+ --psdir=DIR ps documentation [DOCDIR]
+_ACEOF
+
+ cat <<\_ACEOF
+_ACEOF
+fi
+
+if test -n "$ac_init_help"; then
+
+ cat <<\_ACEOF
+
+Some influential environment variables:
+ CC C compiler command
+ CFLAGS C compiler flags
+ LDFLAGS linker flags, e.g. -L if you have libraries in a
+ nonstandard directory
+ LIBS libraries to pass to the linker, e.g. -l
+ CPPFLAGS (Objective) C/C++ preprocessor flags, e.g. -I if
+ you have headers in a nonstandard directory
+ CPP C preprocessor
+
+Use these variables to override the choices made by `configure' or to help
+it to find libraries and programs with nonstandard names/locations.
+
+Report bugs to the package provider.
+_ACEOF
+ac_status=$?
+fi
+
+if test "$ac_init_help" = "recursive"; then
+ # If there are subdirs, report their specific --help.
+ for ac_dir in : $ac_subdirs_all; do test "x$ac_dir" = x: && continue
+ test -d "$ac_dir" ||
+ { cd "$srcdir" && ac_pwd=`pwd` && srcdir=. && test -d "$ac_dir"; } ||
+ continue
+ ac_builddir=.
+
+case "$ac_dir" in
+.) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;;
+*)
+ ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'`
+ # A ".." for each directory in $ac_dir_suffix.
+ ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'`
+ case $ac_top_builddir_sub in
+ "") ac_top_builddir_sub=. ac_top_build_prefix= ;;
+ *) ac_top_build_prefix=$ac_top_builddir_sub/ ;;
+ esac ;;
+esac
+ac_abs_top_builddir=$ac_pwd
+ac_abs_builddir=$ac_pwd$ac_dir_suffix
+# for backward compatibility:
+ac_top_builddir=$ac_top_build_prefix
+
+case $srcdir in
+ .) # We are building in place.
+ ac_srcdir=.
+ ac_top_srcdir=$ac_top_builddir_sub
+ ac_abs_top_srcdir=$ac_pwd ;;
+ [\\/]* | ?:[\\/]* ) # Absolute name.
+ ac_srcdir=$srcdir$ac_dir_suffix;
+ ac_top_srcdir=$srcdir
+ ac_abs_top_srcdir=$srcdir ;;
+ *) # Relative name.
+ ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix
+ ac_top_srcdir=$ac_top_build_prefix$srcdir
+ ac_abs_top_srcdir=$ac_pwd/$srcdir ;;
+esac
+ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix
+
+ cd "$ac_dir" || { ac_status=$?; continue; }
+ # Check for guested configure.
+ if test -f "$ac_srcdir/configure.gnu"; then
+ echo &&
+ $SHELL "$ac_srcdir/configure.gnu" --help=recursive
+ elif test -f "$ac_srcdir/configure"; then
+ echo &&
+ $SHELL "$ac_srcdir/configure" --help=recursive
+ else
+ $as_echo "$as_me: WARNING: no configuration information is in $ac_dir" >&2
+ fi || ac_status=$?
+ cd "$ac_pwd" || { ac_status=$?; break; }
+ done
+fi
+
+test -n "$ac_init_help" && exit $ac_status
+if $ac_init_version; then
+ cat <<\_ACEOF
+configure
+generated by GNU Autoconf 2.69
+
+Copyright (C) 2012 Free Software Foundation, Inc.
+This configure script is free software; the Free Software Foundation
+gives unlimited permission to copy, distribute and modify it.
+_ACEOF
+ exit
+fi
+
+## ------------------------ ##
+## Autoconf initialization. ##
+## ------------------------ ##
+
+# ac_fn_c_try_compile LINENO
+# --------------------------
+# Try to compile conftest.$ac_ext, and return whether this succeeded.
+ac_fn_c_try_compile ()
+{
+ as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+ rm -f conftest.$ac_objext
+ if { { ac_try="$ac_compile"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+ (eval "$ac_compile") 2>conftest.err
+ ac_status=$?
+ if test -s conftest.err; then
+ grep -v '^ *+' conftest.err >conftest.er1
+ cat conftest.er1 >&5
+ mv -f conftest.er1 conftest.err
+ fi
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; } && {
+ test -z "$ac_c_werror_flag" ||
+ test ! -s conftest.err
+ } && test -s conftest.$ac_objext; then :
+ ac_retval=0
+else
+ $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+ ac_retval=1
+fi
+ eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+ as_fn_set_status $ac_retval
+
+} # ac_fn_c_try_compile
+
+# ac_fn_c_try_cpp LINENO
+# ----------------------
+# Try to preprocess conftest.$ac_ext, and return whether this succeeded.
+ac_fn_c_try_cpp ()
+{
+ as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+ if { { ac_try="$ac_cpp conftest.$ac_ext"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+ (eval "$ac_cpp conftest.$ac_ext") 2>conftest.err
+ ac_status=$?
+ if test -s conftest.err; then
+ grep -v '^ *+' conftest.err >conftest.er1
+ cat conftest.er1 >&5
+ mv -f conftest.er1 conftest.err
+ fi
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; } > conftest.i && {
+ test -z "$ac_c_preproc_warn_flag$ac_c_werror_flag" ||
+ test ! -s conftest.err
+ }; then :
+ ac_retval=0
+else
+ $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+ ac_retval=1
+fi
+ eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+ as_fn_set_status $ac_retval
+
+} # ac_fn_c_try_cpp
+
+# ac_fn_c_check_header_mongrel LINENO HEADER VAR INCLUDES
+# -------------------------------------------------------
+# Tests whether HEADER exists, giving a warning if it cannot be compiled using
+# the include files in INCLUDES and setting the cache variable VAR
+# accordingly.
+ac_fn_c_check_header_mongrel ()
+{
+ as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+ if eval \${$3+:} false; then :
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5
+$as_echo_n "checking for $2... " >&6; }
+if eval \${$3+:} false; then :
+ $as_echo_n "(cached) " >&6
+fi
+eval ac_res=\$$3
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
+$as_echo "$ac_res" >&6; }
+else
+ # Is the header compilable?
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking $2 usability" >&5
+$as_echo_n "checking $2 usability... " >&6; }
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+$4
+#include <$2>
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+ ac_header_compiler=yes
+else
+ ac_header_compiler=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_header_compiler" >&5
+$as_echo "$ac_header_compiler" >&6; }
+
+# Is the header present?
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking $2 presence" >&5
+$as_echo_n "checking $2 presence... " >&6; }
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+#include <$2>
+_ACEOF
+if ac_fn_c_try_cpp "$LINENO"; then :
+ ac_header_preproc=yes
+else
+ ac_header_preproc=no
+fi
+rm -f conftest.err conftest.i conftest.$ac_ext
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_header_preproc" >&5
+$as_echo "$ac_header_preproc" >&6; }
+
+# So? What about this header?
+case $ac_header_compiler:$ac_header_preproc:$ac_c_preproc_warn_flag in #((
+ yes:no: )
+ { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: accepted by the compiler, rejected by the preprocessor!" >&5
+$as_echo "$as_me: WARNING: $2: accepted by the compiler, rejected by the preprocessor!" >&2;}
+ { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: proceeding with the compiler's result" >&5
+$as_echo "$as_me: WARNING: $2: proceeding with the compiler's result" >&2;}
+ ;;
+ no:yes:* )
+ { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: present but cannot be compiled" >&5
+$as_echo "$as_me: WARNING: $2: present but cannot be compiled" >&2;}
+ { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: check for missing prerequisite headers?" >&5
+$as_echo "$as_me: WARNING: $2: check for missing prerequisite headers?" >&2;}
+ { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: see the Autoconf documentation" >&5
+$as_echo "$as_me: WARNING: $2: see the Autoconf documentation" >&2;}
+ { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: section \"Present But Cannot Be Compiled\"" >&5
+$as_echo "$as_me: WARNING: $2: section \"Present But Cannot Be Compiled\"" >&2;}
+ { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: proceeding with the compiler's result" >&5
+$as_echo "$as_me: WARNING: $2: proceeding with the compiler's result" >&2;}
+ ;;
+esac
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5
+$as_echo_n "checking for $2... " >&6; }
+if eval \${$3+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ eval "$3=\$ac_header_compiler"
+fi
+eval ac_res=\$$3
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
+$as_echo "$ac_res" >&6; }
+fi
+ eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+
+} # ac_fn_c_check_header_mongrel
+
+# ac_fn_c_try_run LINENO
+# ----------------------
+# Try to link conftest.$ac_ext, and return whether this succeeded. Assumes
+# that executables *can* be run.
+ac_fn_c_try_run ()
+{
+ as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+ if { { ac_try="$ac_link"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+ (eval "$ac_link") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; } && { ac_try='./conftest$ac_exeext'
+ { { case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+ (eval "$ac_try") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; }; }; then :
+ ac_retval=0
+else
+ $as_echo "$as_me: program exited with status $ac_status" >&5
+ $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+ ac_retval=$ac_status
+fi
+ rm -rf conftest.dSYM conftest_ipa8_conftest.oo
+ eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+ as_fn_set_status $ac_retval
+
+} # ac_fn_c_try_run
+
+# ac_fn_c_check_header_compile LINENO HEADER VAR INCLUDES
+# -------------------------------------------------------
+# Tests whether HEADER exists and can be compiled using the include files in
+# INCLUDES, setting the cache variable VAR accordingly.
+ac_fn_c_check_header_compile ()
+{
+ as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5
+$as_echo_n "checking for $2... " >&6; }
+if eval \${$3+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+$4
+#include <$2>
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+ eval "$3=yes"
+else
+ eval "$3=no"
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+eval ac_res=\$$3
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
+$as_echo "$ac_res" >&6; }
+ eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+
+} # ac_fn_c_check_header_compile
+cat >config.log <<_ACEOF
+This file contains any messages produced by compilers while
+running configure, to aid debugging if configure makes a mistake.
+
+It was created by $as_me, which was
+generated by GNU Autoconf 2.69. Invocation command line was
+
+ $ $0 $@
+
+_ACEOF
+exec 5>>config.log
+{
+cat <<_ASUNAME
+## --------- ##
+## Platform. ##
+## --------- ##
+
+hostname = `(hostname || uname -n) 2>/dev/null | sed 1q`
+uname -m = `(uname -m) 2>/dev/null || echo unknown`
+uname -r = `(uname -r) 2>/dev/null || echo unknown`
+uname -s = `(uname -s) 2>/dev/null || echo unknown`
+uname -v = `(uname -v) 2>/dev/null || echo unknown`
+
+/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null || echo unknown`
+/bin/uname -X = `(/bin/uname -X) 2>/dev/null || echo unknown`
+
+/bin/arch = `(/bin/arch) 2>/dev/null || echo unknown`
+/usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null || echo unknown`
+/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null || echo unknown`
+/usr/bin/hostinfo = `(/usr/bin/hostinfo) 2>/dev/null || echo unknown`
+/bin/machine = `(/bin/machine) 2>/dev/null || echo unknown`
+/usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null || echo unknown`
+/bin/universe = `(/bin/universe) 2>/dev/null || echo unknown`
+
+_ASUNAME
+
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ $as_echo "PATH: $as_dir"
+ done
+IFS=$as_save_IFS
+
+} >&5
+
+cat >&5 <<_ACEOF
+
+
+## ----------- ##
+## Core tests. ##
+## ----------- ##
+
+_ACEOF
+
+
+# Keep a trace of the command line.
+# Strip out --no-create and --no-recursion so they do not pile up.
+# Strip out --silent because we don't want to record it for future runs.
+# Also quote any args containing shell meta-characters.
+# Make two passes to allow for proper duplicate-argument suppression.
+ac_configure_args=
+ac_configure_args0=
+ac_configure_args1=
+ac_must_keep_next=false
+for ac_pass in 1 2
+do
+ for ac_arg
+ do
+ case $ac_arg in
+ -no-create | --no-c* | -n | -no-recursion | --no-r*) continue ;;
+ -q | -quiet | --quiet | --quie | --qui | --qu | --q \
+ | -silent | --silent | --silen | --sile | --sil)
+ continue ;;
+ *\'*)
+ ac_arg=`$as_echo "$ac_arg" | sed "s/'/'\\\\\\\\''/g"` ;;
+ esac
+ case $ac_pass in
+ 1) as_fn_append ac_configure_args0 " '$ac_arg'" ;;
+ 2)
+ as_fn_append ac_configure_args1 " '$ac_arg'"
+ if test $ac_must_keep_next = true; then
+ ac_must_keep_next=false # Got value, back to normal.
+ else
+ case $ac_arg in
+ *=* | --config-cache | -C | -disable-* | --disable-* \
+ | -enable-* | --enable-* | -gas | --g* | -nfp | --nf* \
+ | -q | -quiet | --q* | -silent | --sil* | -v | -verb* \
+ | -with-* | --with-* | -without-* | --without-* | --x)
+ case "$ac_configure_args0 " in
+ "$ac_configure_args1"*" '$ac_arg' "* ) continue ;;
+ esac
+ ;;
+ -* ) ac_must_keep_next=true ;;
+ esac
+ fi
+ as_fn_append ac_configure_args " '$ac_arg'"
+ ;;
+ esac
+ done
+done
+{ ac_configure_args0=; unset ac_configure_args0;}
+{ ac_configure_args1=; unset ac_configure_args1;}
+
+# When interrupted or exit'd, cleanup temporary files, and complete
+# config.log. We remove comments because anyway the quotes in there
+# would cause problems or look ugly.
+# WARNING: Use '\'' to represent an apostrophe within the trap.
+# WARNING: Do not start the trap code with a newline, due to a FreeBSD 4.0 bug.
+trap 'exit_status=$?
+ # Save into config.log some information that might help in debugging.
+ {
+ echo
+
+ $as_echo "## ---------------- ##
+## Cache variables. ##
+## ---------------- ##"
+ echo
+ # The following way of writing the cache mishandles newlines in values,
+(
+ for ac_var in `(set) 2>&1 | sed -n '\''s/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'\''`; do
+ eval ac_val=\$$ac_var
+ case $ac_val in #(
+ *${as_nl}*)
+ case $ac_var in #(
+ *_cv_*) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5
+$as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;;
+ esac
+ case $ac_var in #(
+ _ | IFS | as_nl) ;; #(
+ BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #(
+ *) { eval $ac_var=; unset $ac_var;} ;;
+ esac ;;
+ esac
+ done
+ (set) 2>&1 |
+ case $as_nl`(ac_space='\'' '\''; set) 2>&1` in #(
+ *${as_nl}ac_space=\ *)
+ sed -n \
+ "s/'\''/'\''\\\\'\'''\''/g;
+ s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\''\\2'\''/p"
+ ;; #(
+ *)
+ sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p"
+ ;;
+ esac |
+ sort
+)
+ echo
+
+ $as_echo "## ----------------- ##
+## Output variables. ##
+## ----------------- ##"
+ echo
+ for ac_var in $ac_subst_vars
+ do
+ eval ac_val=\$$ac_var
+ case $ac_val in
+ *\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;;
+ esac
+ $as_echo "$ac_var='\''$ac_val'\''"
+ done | sort
+ echo
+
+ if test -n "$ac_subst_files"; then
+ $as_echo "## ------------------- ##
+## File substitutions. ##
+## ------------------- ##"
+ echo
+ for ac_var in $ac_subst_files
+ do
+ eval ac_val=\$$ac_var
+ case $ac_val in
+ *\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;;
+ esac
+ $as_echo "$ac_var='\''$ac_val'\''"
+ done | sort
+ echo
+ fi
+
+ if test -s confdefs.h; then
+ $as_echo "## ----------- ##
+## confdefs.h. ##
+## ----------- ##"
+ echo
+ cat confdefs.h
+ echo
+ fi
+ test "$ac_signal" != 0 &&
+ $as_echo "$as_me: caught signal $ac_signal"
+ $as_echo "$as_me: exit $exit_status"
+ } >&5
+ rm -f core *.core core.conftest.* &&
+ rm -f -r conftest* confdefs* conf$$* $ac_clean_files &&
+ exit $exit_status
+' 0
+for ac_signal in 1 2 13 15; do
+ trap 'ac_signal='$ac_signal'; as_fn_exit 1' $ac_signal
+done
+ac_signal=0
+
+# confdefs.h avoids OS command line length limits that DEFS can exceed.
+rm -f -r conftest* confdefs.h
+
+$as_echo "/* confdefs.h */" > confdefs.h
+
+# Predefined preprocessor variables.
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_NAME "$PACKAGE_NAME"
+_ACEOF
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_TARNAME "$PACKAGE_TARNAME"
+_ACEOF
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_VERSION "$PACKAGE_VERSION"
+_ACEOF
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_STRING "$PACKAGE_STRING"
+_ACEOF
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_BUGREPORT "$PACKAGE_BUGREPORT"
+_ACEOF
+
+cat >>confdefs.h <<_ACEOF
+#define PACKAGE_URL "$PACKAGE_URL"
+_ACEOF
+
+
+# Let the site file select an alternate cache file if it wants to.
+# Prefer an explicitly selected file to automatically selected ones.
+ac_site_file1=NONE
+ac_site_file2=NONE
+if test -n "$CONFIG_SITE"; then
+ # We do not want a PATH search for config.site.
+ case $CONFIG_SITE in #((
+ -*) ac_site_file1=./$CONFIG_SITE;;
+ */*) ac_site_file1=$CONFIG_SITE;;
+ *) ac_site_file1=./$CONFIG_SITE;;
+ esac
+elif test "x$prefix" != xNONE; then
+ ac_site_file1=$prefix/share/config.site
+ ac_site_file2=$prefix/etc/config.site
+else
+ ac_site_file1=$ac_default_prefix/share/config.site
+ ac_site_file2=$ac_default_prefix/etc/config.site
+fi
+for ac_site_file in "$ac_site_file1" "$ac_site_file2"
+do
+ test "x$ac_site_file" = xNONE && continue
+ if test /dev/null != "$ac_site_file" && test -r "$ac_site_file"; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: loading site script $ac_site_file" >&5
+$as_echo "$as_me: loading site script $ac_site_file" >&6;}
+ sed 's/^/| /' "$ac_site_file" >&5
+ . "$ac_site_file" \
+ || { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "failed to load site script $ac_site_file
+See \`config.log' for more details" "$LINENO" 5; }
+ fi
+done
+
+if test -r "$cache_file"; then
+ # Some versions of bash will fail to source /dev/null (special files
+ # actually), so we avoid doing that. DJGPP emulates it as a regular file.
+ if test /dev/null != "$cache_file" && test -f "$cache_file"; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: loading cache $cache_file" >&5
+$as_echo "$as_me: loading cache $cache_file" >&6;}
+ case $cache_file in
+ [\\/]* | ?:[\\/]* ) . "$cache_file";;
+ *) . "./$cache_file";;
+ esac
+ fi
+else
+ { $as_echo "$as_me:${as_lineno-$LINENO}: creating cache $cache_file" >&5
+$as_echo "$as_me: creating cache $cache_file" >&6;}
+ >$cache_file
+fi
+
+# Check that the precious variables saved in the cache have kept the same
+# value.
+ac_cache_corrupted=false
+for ac_var in $ac_precious_vars; do
+ eval ac_old_set=\$ac_cv_env_${ac_var}_set
+ eval ac_new_set=\$ac_env_${ac_var}_set
+ eval ac_old_val=\$ac_cv_env_${ac_var}_value
+ eval ac_new_val=\$ac_env_${ac_var}_value
+ case $ac_old_set,$ac_new_set in
+ set,)
+ { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&5
+$as_echo "$as_me: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&2;}
+ ac_cache_corrupted=: ;;
+ ,set)
+ { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was not set in the previous run" >&5
+$as_echo "$as_me: error: \`$ac_var' was not set in the previous run" >&2;}
+ ac_cache_corrupted=: ;;
+ ,);;
+ *)
+ if test "x$ac_old_val" != "x$ac_new_val"; then
+ # differences in whitespace do not lead to failure.
+ ac_old_val_w=`echo x $ac_old_val`
+ ac_new_val_w=`echo x $ac_new_val`
+ if test "$ac_old_val_w" != "$ac_new_val_w"; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' has changed since the previous run:" >&5
+$as_echo "$as_me: error: \`$ac_var' has changed since the previous run:" >&2;}
+ ac_cache_corrupted=:
+ else
+ { $as_echo "$as_me:${as_lineno-$LINENO}: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&5
+$as_echo "$as_me: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&2;}
+ eval $ac_var=\$ac_old_val
+ fi
+ { $as_echo "$as_me:${as_lineno-$LINENO}: former value: \`$ac_old_val'" >&5
+$as_echo "$as_me: former value: \`$ac_old_val'" >&2;}
+ { $as_echo "$as_me:${as_lineno-$LINENO}: current value: \`$ac_new_val'" >&5
+$as_echo "$as_me: current value: \`$ac_new_val'" >&2;}
+ fi;;
+ esac
+ # Pass precious variables to config.status.
+ if test "$ac_new_set" = set; then
+ case $ac_new_val in
+ *\'*) ac_arg=$ac_var=`$as_echo "$ac_new_val" | sed "s/'/'\\\\\\\\''/g"` ;;
+ *) ac_arg=$ac_var=$ac_new_val ;;
+ esac
+ case " $ac_configure_args " in
+ *" '$ac_arg' "*) ;; # Avoid dups. Use of quotes ensures accuracy.
+ *) as_fn_append ac_configure_args " '$ac_arg'" ;;
+ esac
+ fi
+done
+if $ac_cache_corrupted; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+ { $as_echo "$as_me:${as_lineno-$LINENO}: error: changes in the environment can compromise the build" >&5
+$as_echo "$as_me: error: changes in the environment can compromise the build" >&2;}
+ as_fn_error $? "run \`make distclean' and/or \`rm $cache_file' and start over" "$LINENO" 5
+fi
+## -------------------- ##
+## Main body of script. ##
+## -------------------- ##
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+if test -n "$ac_tool_prefix"; then
+ # Extract the first word of "${ac_tool_prefix}gcc", so it can be a program name with args.
+set dummy ${ac_tool_prefix}gcc; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_CC+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ if test -n "$CC"; then
+ ac_cv_prog_CC="$CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+ ac_cv_prog_CC="${ac_tool_prefix}gcc"
+ $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+ break 2
+ fi
+done
+ done
+IFS=$as_save_IFS
+
+fi
+fi
+CC=$ac_cv_prog_CC
+if test -n "$CC"; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
+$as_echo "$CC" >&6; }
+else
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+fi
+if test -z "$ac_cv_prog_CC"; then
+ ac_ct_CC=$CC
+ # Extract the first word of "gcc", so it can be a program name with args.
+set dummy gcc; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_ac_ct_CC+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ if test -n "$ac_ct_CC"; then
+ ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+ ac_cv_prog_ac_ct_CC="gcc"
+ $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+ break 2
+ fi
+done
+ done
+IFS=$as_save_IFS
+
+fi
+fi
+ac_ct_CC=$ac_cv_prog_ac_ct_CC
+if test -n "$ac_ct_CC"; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5
+$as_echo "$ac_ct_CC" >&6; }
+else
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+ if test "x$ac_ct_CC" = x; then
+ CC=""
+ else
+ case $cross_compiling:$ac_tool_warned in
+yes:)
+{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+ CC=$ac_ct_CC
+ fi
+else
+ CC="$ac_cv_prog_CC"
+fi
+
+if test -z "$CC"; then
+ if test -n "$ac_tool_prefix"; then
+ # Extract the first word of "${ac_tool_prefix}cc", so it can be a program name with args.
+set dummy ${ac_tool_prefix}cc; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_CC+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ if test -n "$CC"; then
+ ac_cv_prog_CC="$CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+ ac_cv_prog_CC="${ac_tool_prefix}cc"
+ $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+ break 2
+ fi
+done
+ done
+IFS=$as_save_IFS
+
+fi
+fi
+CC=$ac_cv_prog_CC
+if test -n "$CC"; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
+$as_echo "$CC" >&6; }
+else
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+ fi
+fi
+if test -z "$CC"; then
+ # Extract the first word of "cc", so it can be a program name with args.
+set dummy cc; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_CC+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ if test -n "$CC"; then
+ ac_cv_prog_CC="$CC" # Let the user override the test.
+else
+ ac_prog_rejected=no
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+ if test "$as_dir/$ac_word$ac_exec_ext" = "/usr/ucb/cc"; then
+ ac_prog_rejected=yes
+ continue
+ fi
+ ac_cv_prog_CC="cc"
+ $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+ break 2
+ fi
+done
+ done
+IFS=$as_save_IFS
+
+if test $ac_prog_rejected = yes; then
+ # We found a bogon in the path, so make sure we never use it.
+ set dummy $ac_cv_prog_CC
+ shift
+ if test $# != 0; then
+ # We chose a different compiler from the bogus one.
+ # However, it has the same basename, so the bogon will be chosen
+ # first if we set CC to just the basename; use the full file name.
+ shift
+ ac_cv_prog_CC="$as_dir/$ac_word${1+' '}$@"
+ fi
+fi
+fi
+fi
+CC=$ac_cv_prog_CC
+if test -n "$CC"; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
+$as_echo "$CC" >&6; }
+else
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+fi
+if test -z "$CC"; then
+ if test -n "$ac_tool_prefix"; then
+ for ac_prog in cl.exe
+ do
+ # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args.
+set dummy $ac_tool_prefix$ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_CC+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ if test -n "$CC"; then
+ ac_cv_prog_CC="$CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+ ac_cv_prog_CC="$ac_tool_prefix$ac_prog"
+ $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+ break 2
+ fi
+done
+ done
+IFS=$as_save_IFS
+
+fi
+fi
+CC=$ac_cv_prog_CC
+if test -n "$CC"; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
+$as_echo "$CC" >&6; }
+else
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+ test -n "$CC" && break
+ done
+fi
+if test -z "$CC"; then
+ ac_ct_CC=$CC
+ for ac_prog in cl.exe
+do
+ # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_ac_ct_CC+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ if test -n "$ac_ct_CC"; then
+ ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+ ac_cv_prog_ac_ct_CC="$ac_prog"
+ $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+ break 2
+ fi
+done
+ done
+IFS=$as_save_IFS
+
+fi
+fi
+ac_ct_CC=$ac_cv_prog_ac_ct_CC
+if test -n "$ac_ct_CC"; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5
+$as_echo "$ac_ct_CC" >&6; }
+else
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+ test -n "$ac_ct_CC" && break
+done
+
+ if test "x$ac_ct_CC" = x; then
+ CC=""
+ else
+ case $cross_compiling:$ac_tool_warned in
+yes:)
+{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+ CC=$ac_ct_CC
+ fi
+fi
+
+fi
+
+
+test -z "$CC" && { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "no acceptable C compiler found in \$PATH
+See \`config.log' for more details" "$LINENO" 5; }
+
+# Provide some information about the compiler.
+$as_echo "$as_me:${as_lineno-$LINENO}: checking for C compiler version" >&5
+set X $ac_compile
+ac_compiler=$2
+for ac_option in --version -v -V -qversion; do
+ { { ac_try="$ac_compiler $ac_option >&5"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+ (eval "$ac_compiler $ac_option >&5") 2>conftest.err
+ ac_status=$?
+ if test -s conftest.err; then
+ sed '10a\
+... rest of stderr output deleted ...
+ 10q' conftest.err >conftest.er1
+ cat conftest.er1 >&5
+ fi
+ rm -f conftest.er1 conftest.err
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; }
+done
+
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+ac_clean_files_save=$ac_clean_files
+ac_clean_files="$ac_clean_files a.out a.out.dSYM a.exe b.out"
+# Try to create an executable without -o first, disregard a.out.
+# It will help us diagnose broken compilers, and finding out an intuition
+# of exeext.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the C compiler works" >&5
+$as_echo_n "checking whether the C compiler works... " >&6; }
+ac_link_default=`$as_echo "$ac_link" | sed 's/ -o *conftest[^ ]*//'`
+
+# The possible output files:
+ac_files="a.out conftest.exe conftest a.exe a_out.exe b.out conftest.*"
+
+ac_rmfiles=
+for ac_file in $ac_files
+do
+ case $ac_file in
+ *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) ;;
+ * ) ac_rmfiles="$ac_rmfiles $ac_file";;
+ esac
+done
+rm -f $ac_rmfiles
+
+if { { ac_try="$ac_link_default"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+ (eval "$ac_link_default") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; }; then :
+ # Autoconf-2.13 could set the ac_cv_exeext variable to `no'.
+# So ignore a value of `no', otherwise this would lead to `EXEEXT = no'
+# in a Makefile. We should not override ac_cv_exeext if it was cached,
+# so that the user can short-circuit this test for compilers unknown to
+# Autoconf.
+for ac_file in $ac_files ''
+do
+ test -f "$ac_file" || continue
+ case $ac_file in
+ *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj )
+ ;;
+ [ab].out )
+ # We found the default executable, but exeext='' is most
+ # certainly right.
+ break;;
+ *.* )
+ if test "${ac_cv_exeext+set}" = set && test "$ac_cv_exeext" != no;
+ then :; else
+ ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'`
+ fi
+ # We set ac_cv_exeext here because the later test for it is not
+ # safe: cross compilers may not add the suffix if given an `-o'
+ # argument, so we may need to know it at that point already.
+ # Even if this section looks crufty: it has the advantage of
+ # actually working.
+ break;;
+ * )
+ break;;
+ esac
+done
+test "$ac_cv_exeext" = no && ac_cv_exeext=
+
+else
+ ac_file=''
+fi
+if test -z "$ac_file"; then :
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+$as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error 77 "C compiler cannot create executables
+See \`config.log' for more details" "$LINENO" 5; }
+else
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for C compiler default output file name" >&5
+$as_echo_n "checking for C compiler default output file name... " >&6; }
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_file" >&5
+$as_echo "$ac_file" >&6; }
+ac_exeext=$ac_cv_exeext
+
+rm -f -r a.out a.out.dSYM a.exe conftest$ac_cv_exeext b.out
+ac_clean_files=$ac_clean_files_save
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for suffix of executables" >&5
+$as_echo_n "checking for suffix of executables... " >&6; }
+if { { ac_try="$ac_link"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+ (eval "$ac_link") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; }; then :
+ # If both `conftest.exe' and `conftest' are `present' (well, observable)
+# catch `conftest.exe'. For instance with Cygwin, `ls conftest' will
+# work properly (i.e., refer to `conftest.exe'), while it won't with
+# `rm'.
+for ac_file in conftest.exe conftest conftest.*; do
+ test -f "$ac_file" || continue
+ case $ac_file in
+ *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) ;;
+ *.* ) ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'`
+ break;;
+ * ) break;;
+ esac
+done
+else
+ { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "cannot compute suffix of executables: cannot compile and link
+See \`config.log' for more details" "$LINENO" 5; }
+fi
+rm -f conftest conftest$ac_cv_exeext
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_exeext" >&5
+$as_echo "$ac_cv_exeext" >&6; }
+
+rm -f conftest.$ac_ext
+EXEEXT=$ac_cv_exeext
+ac_exeext=$EXEEXT
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+#include
+int
+main ()
+{
+FILE *f = fopen ("conftest.out", "w");
+ return ferror (f) || fclose (f) != 0;
+
+ ;
+ return 0;
+}
+_ACEOF
+ac_clean_files="$ac_clean_files conftest.out"
+# Check that the compiler produces executables we can run. If not, either
+# the compiler is broken, or we cross compile.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are cross compiling" >&5
+$as_echo_n "checking whether we are cross compiling... " >&6; }
+if test "$cross_compiling" != yes; then
+ { { ac_try="$ac_link"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+ (eval "$ac_link") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; }
+ if { ac_try='./conftest$ac_cv_exeext'
+ { { case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+ (eval "$ac_try") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; }; }; then
+ cross_compiling=no
+ else
+ if test "$cross_compiling" = maybe; then
+ cross_compiling=yes
+ else
+ { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "cannot run C compiled programs.
+If you meant to cross compile, use \`--host'.
+See \`config.log' for more details" "$LINENO" 5; }
+ fi
+ fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $cross_compiling" >&5
+$as_echo "$cross_compiling" >&6; }
+
+rm -f conftest.$ac_ext conftest$ac_cv_exeext conftest.out
+ac_clean_files=$ac_clean_files_save
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for suffix of object files" >&5
+$as_echo_n "checking for suffix of object files... " >&6; }
+if ${ac_cv_objext+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+rm -f conftest.o conftest.obj
+if { { ac_try="$ac_compile"
+case "(($ac_try" in
+ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+ *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+$as_echo "$ac_try_echo"; } >&5
+ (eval "$ac_compile") 2>&5
+ ac_status=$?
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+ test $ac_status = 0; }; then :
+ for ac_file in conftest.o conftest.obj conftest.*; do
+ test -f "$ac_file" || continue;
+ case $ac_file in
+ *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM ) ;;
+ *) ac_cv_objext=`expr "$ac_file" : '.*\.\(.*\)'`
+ break;;
+ esac
+done
+else
+ $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "cannot compute suffix of object files: cannot compile
+See \`config.log' for more details" "$LINENO" 5; }
+fi
+rm -f conftest.$ac_cv_objext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_objext" >&5
+$as_echo "$ac_cv_objext" >&6; }
+OBJEXT=$ac_cv_objext
+ac_objext=$OBJEXT
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using the GNU C compiler" >&5
+$as_echo_n "checking whether we are using the GNU C compiler... " >&6; }
+if ${ac_cv_c_compiler_gnu+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+int
+main ()
+{
+#ifndef __GNUC__
+ choke me
+#endif
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+ ac_compiler_gnu=yes
+else
+ ac_compiler_gnu=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ac_cv_c_compiler_gnu=$ac_compiler_gnu
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_compiler_gnu" >&5
+$as_echo "$ac_cv_c_compiler_gnu" >&6; }
+if test $ac_compiler_gnu = yes; then
+ GCC=yes
+else
+ GCC=
+fi
+ac_test_CFLAGS=${CFLAGS+set}
+ac_save_CFLAGS=$CFLAGS
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CC accepts -g" >&5
+$as_echo_n "checking whether $CC accepts -g... " >&6; }
+if ${ac_cv_prog_cc_g+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ ac_save_c_werror_flag=$ac_c_werror_flag
+ ac_c_werror_flag=yes
+ ac_cv_prog_cc_g=no
+ CFLAGS="-g"
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+ ac_cv_prog_cc_g=yes
+else
+ CFLAGS=""
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+
+else
+ ac_c_werror_flag=$ac_save_c_werror_flag
+ CFLAGS="-g"
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+ ac_cv_prog_cc_g=yes
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ ac_c_werror_flag=$ac_save_c_werror_flag
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_g" >&5
+$as_echo "$ac_cv_prog_cc_g" >&6; }
+if test "$ac_test_CFLAGS" = set; then
+ CFLAGS=$ac_save_CFLAGS
+elif test $ac_cv_prog_cc_g = yes; then
+ if test "$GCC" = yes; then
+ CFLAGS="-g -O2"
+ else
+ CFLAGS="-g"
+ fi
+else
+ if test "$GCC" = yes; then
+ CFLAGS="-O2"
+ else
+ CFLAGS=
+ fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $CC option to accept ISO C89" >&5
+$as_echo_n "checking for $CC option to accept ISO C89... " >&6; }
+if ${ac_cv_prog_cc_c89+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ ac_cv_prog_cc_c89=no
+ac_save_CC=$CC
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+#include
+#include
+struct stat;
+/* Most of the following tests are stolen from RCS 5.7's src/conf.sh. */
+struct buf { int x; };
+FILE * (*rcsopen) (struct buf *, struct stat *, int);
+static char *e (p, i)
+ char **p;
+ int i;
+{
+ return p[i];
+}
+static char *f (char * (*g) (char **, int), char **p, ...)
+{
+ char *s;
+ va_list v;
+ va_start (v,p);
+ s = g (p, va_arg (v,int));
+ va_end (v);
+ return s;
+}
+
+/* OSF 4.0 Compaq cc is some sort of almost-ANSI by default. It has
+ function prototypes and stuff, but not '\xHH' hex character constants.
+ These don't provoke an error unfortunately, instead are silently treated
+ as 'x'. The following induces an error, until -std is added to get
+ proper ANSI mode. Curiously '\x00'!='x' always comes out true, for an
+ array size at least. It's necessary to write '\x00'==0 to get something
+ that's true only with -std. */
+int osf4_cc_array ['\x00' == 0 ? 1 : -1];
+
+/* IBM C 6 for AIX is almost-ANSI by default, but it replaces macro parameters
+ inside strings and character constants. */
+#define FOO(x) 'x'
+int xlc6_cc_array[FOO(a) == 'x' ? 1 : -1];
+
+int test (int i, double x);
+struct s1 {int (*f) (int a);};
+struct s2 {int (*f) (double a);};
+int pairnames (int, char **, FILE *(*)(struct buf *, struct stat *, int), int, int);
+int argc;
+char **argv;
+int
+main ()
+{
+return f (e, argv, 0) != argv[0] || f (e, argv, 1) != argv[1];
+ ;
+ return 0;
+}
+_ACEOF
+for ac_arg in '' -qlanglvl=extc89 -qlanglvl=ansi -std \
+ -Ae "-Aa -D_HPUX_SOURCE" "-Xc -D__EXTENSIONS__"
+do
+ CC="$ac_save_CC $ac_arg"
+ if ac_fn_c_try_compile "$LINENO"; then :
+ ac_cv_prog_cc_c89=$ac_arg
+fi
+rm -f core conftest.err conftest.$ac_objext
+ test "x$ac_cv_prog_cc_c89" != "xno" && break
+done
+rm -f conftest.$ac_ext
+CC=$ac_save_CC
+
+fi
+# AC_CACHE_VAL
+case "x$ac_cv_prog_cc_c89" in
+ x)
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: none needed" >&5
+$as_echo "none needed" >&6; } ;;
+ xno)
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5
+$as_echo "unsupported" >&6; } ;;
+ *)
+ CC="$CC $ac_cv_prog_cc_c89"
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_c89" >&5
+$as_echo "$ac_cv_prog_cc_c89" >&6; } ;;
+esac
+if test "x$ac_cv_prog_cc_c89" != xno; then :
+
+fi
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+ac_aux_dir=
+for ac_dir in "$srcdir" "$srcdir/.." "$srcdir/../.."; do
+ if test -f "$ac_dir/install-sh"; then
+ ac_aux_dir=$ac_dir
+ ac_install_sh="$ac_aux_dir/install-sh -c"
+ break
+ elif test -f "$ac_dir/install.sh"; then
+ ac_aux_dir=$ac_dir
+ ac_install_sh="$ac_aux_dir/install.sh -c"
+ break
+ elif test -f "$ac_dir/shtool"; then
+ ac_aux_dir=$ac_dir
+ ac_install_sh="$ac_aux_dir/shtool install -c"
+ break
+ fi
+done
+if test -z "$ac_aux_dir"; then
+ as_fn_error $? "cannot find install-sh, install.sh, or shtool in \"$srcdir\" \"$srcdir/..\" \"$srcdir/../..\"" "$LINENO" 5
+fi
+
+# These three variables are undocumented and unsupported,
+# and are intended to be withdrawn in a future Autoconf release.
+# They can cause serious problems if a builder's source tree is in a directory
+# whose full name contains unusual characters.
+ac_config_guess="$SHELL $ac_aux_dir/config.guess" # Please don't use this var.
+ac_config_sub="$SHELL $ac_aux_dir/config.sub" # Please don't use this var.
+ac_configure="$SHELL $ac_aux_dir/configure" # Please don't use this var.
+
+
+# Find a good install program. We prefer a C program (faster),
+# so one script is as good as another. But avoid the broken or
+# incompatible versions:
+# SysV /etc/install, /usr/sbin/install
+# SunOS /usr/etc/install
+# IRIX /sbin/install
+# AIX /bin/install
+# AmigaOS /C/install, which installs bootblocks on floppy discs
+# AIX 4 /usr/bin/installbsd, which doesn't work without a -g flag
+# AFS /usr/afsws/bin/install, which mishandles nonexistent args
+# SVR4 /usr/ucb/install, which tries to use the nonexistent group "staff"
+# OS/2's system install, which has a completely different semantic
+# ./install, which can be erroneously created by make from ./install.sh.
+# Reject install programs that cannot install multiple files.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for a BSD-compatible install" >&5
+$as_echo_n "checking for a BSD-compatible install... " >&6; }
+if test -z "$INSTALL"; then
+if ${ac_cv_path_install+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ # Account for people who put trailing slashes in PATH elements.
+case $as_dir/ in #((
+ ./ | .// | /[cC]/* | \
+ /etc/* | /usr/sbin/* | /usr/etc/* | /sbin/* | /usr/afsws/bin/* | \
+ ?:[\\/]os2[\\/]install[\\/]* | ?:[\\/]OS2[\\/]INSTALL[\\/]* | \
+ /usr/ucb/* ) ;;
+ *)
+ # OSF1 and SCO ODT 3.0 have their own names for install.
+ # Don't use installbsd from OSF since it installs stuff as root
+ # by default.
+ for ac_prog in ginstall scoinst install; do
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ if as_fn_executable_p "$as_dir/$ac_prog$ac_exec_ext"; then
+ if test $ac_prog = install &&
+ grep dspmsg "$as_dir/$ac_prog$ac_exec_ext" >/dev/null 2>&1; then
+ # AIX install. It has an incompatible calling convention.
+ :
+ elif test $ac_prog = install &&
+ grep pwplus "$as_dir/$ac_prog$ac_exec_ext" >/dev/null 2>&1; then
+ # program-specific install script used by HP pwplus--don't use.
+ :
+ else
+ rm -rf conftest.one conftest.two conftest.dir
+ echo one > conftest.one
+ echo two > conftest.two
+ mkdir conftest.dir
+ if "$as_dir/$ac_prog$ac_exec_ext" -c conftest.one conftest.two "`pwd`/conftest.dir" &&
+ test -s conftest.one && test -s conftest.two &&
+ test -s conftest.dir/conftest.one &&
+ test -s conftest.dir/conftest.two
+ then
+ ac_cv_path_install="$as_dir/$ac_prog$ac_exec_ext -c"
+ break 3
+ fi
+ fi
+ fi
+ done
+ done
+ ;;
+esac
+
+ done
+IFS=$as_save_IFS
+
+rm -rf conftest.one conftest.two conftest.dir
+
+fi
+ if test "${ac_cv_path_install+set}" = set; then
+ INSTALL=$ac_cv_path_install
+ else
+ # As a last resort, use the slow shell script. Don't cache a
+ # value for INSTALL within a source directory, because that will
+ # break other packages using the cache if that directory is
+ # removed, or if the value is a relative name.
+ INSTALL=$ac_install_sh
+ fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $INSTALL" >&5
+$as_echo "$INSTALL" >&6; }
+
+# Use test -z because SunOS4 sh mishandles braces in ${var-val}.
+# It thinks the first close brace ends the variable substitution.
+test -z "$INSTALL_PROGRAM" && INSTALL_PROGRAM='${INSTALL}'
+
+test -z "$INSTALL_SCRIPT" && INSTALL_SCRIPT='${INSTALL}'
+
+test -z "$INSTALL_DATA" && INSTALL_DATA='${INSTALL} -m 644'
+
+CFLAGS="-O3"
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking how to run the C preprocessor" >&5
+$as_echo_n "checking how to run the C preprocessor... " >&6; }
+# On Suns, sometimes $CPP names a directory.
+if test -n "$CPP" && test -d "$CPP"; then
+ CPP=
+fi
+if test -z "$CPP"; then
+ if ${ac_cv_prog_CPP+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ # Double quotes because CPP needs to be expanded
+ for CPP in "$CC -E" "$CC -E -traditional-cpp" "/lib/cpp"
+ do
+ ac_preproc_ok=false
+for ac_c_preproc_warn_flag in '' yes
+do
+ # Use a header file that comes with gcc, so configuring glibc
+ # with a fresh cross-compiler works.
+ # Prefer to if __STDC__ is defined, since
+ # exists even on freestanding compilers.
+ # On the NeXT, cc -E runs the code through the compiler's parser,
+ # not just through cpp. "Syntax error" is here to catch this case.
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+#ifdef __STDC__
+# include
+#else
+# include
+#endif
+ Syntax error
+_ACEOF
+if ac_fn_c_try_cpp "$LINENO"; then :
+
+else
+ # Broken: fails on valid input.
+continue
+fi
+rm -f conftest.err conftest.i conftest.$ac_ext
+
+ # OK, works on sane cases. Now check whether nonexistent headers
+ # can be detected and how.
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+#include
+_ACEOF
+if ac_fn_c_try_cpp "$LINENO"; then :
+ # Broken: success on invalid input.
+continue
+else
+ # Passes both tests.
+ac_preproc_ok=:
+break
+fi
+rm -f conftest.err conftest.i conftest.$ac_ext
+
+done
+# Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped.
+rm -f conftest.i conftest.err conftest.$ac_ext
+if $ac_preproc_ok; then :
+ break
+fi
+
+ done
+ ac_cv_prog_CPP=$CPP
+
+fi
+ CPP=$ac_cv_prog_CPP
+else
+ ac_cv_prog_CPP=$CPP
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $CPP" >&5
+$as_echo "$CPP" >&6; }
+ac_preproc_ok=false
+for ac_c_preproc_warn_flag in '' yes
+do
+ # Use a header file that comes with gcc, so configuring glibc
+ # with a fresh cross-compiler works.
+ # Prefer to if __STDC__ is defined, since
+ # exists even on freestanding compilers.
+ # On the NeXT, cc -E runs the code through the compiler's parser,
+ # not just through cpp. "Syntax error" is here to catch this case.
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+#ifdef __STDC__
+# include
+#else
+# include
+#endif
+ Syntax error
+_ACEOF
+if ac_fn_c_try_cpp "$LINENO"; then :
+
+else
+ # Broken: fails on valid input.
+continue
+fi
+rm -f conftest.err conftest.i conftest.$ac_ext
+
+ # OK, works on sane cases. Now check whether nonexistent headers
+ # can be detected and how.
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+#include
+_ACEOF
+if ac_fn_c_try_cpp "$LINENO"; then :
+ # Broken: success on invalid input.
+continue
+else
+ # Passes both tests.
+ac_preproc_ok=:
+break
+fi
+rm -f conftest.err conftest.i conftest.$ac_ext
+
+done
+# Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped.
+rm -f conftest.i conftest.err conftest.$ac_ext
+if $ac_preproc_ok; then :
+
+else
+ { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error $? "C preprocessor \"$CPP\" fails sanity check
+See \`config.log' for more details" "$LINENO" 5; }
+fi
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for grep that handles long lines and -e" >&5
+$as_echo_n "checking for grep that handles long lines and -e... " >&6; }
+if ${ac_cv_path_GREP+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ if test -z "$GREP"; then
+ ac_path_GREP_found=false
+ # Loop through the user's path and test for each of PROGNAME-LIST
+ as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for ac_prog in grep ggrep; do
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ ac_path_GREP="$as_dir/$ac_prog$ac_exec_ext"
+ as_fn_executable_p "$ac_path_GREP" || continue
+# Check for GNU ac_path_GREP and select it if it is found.
+ # Check for GNU $ac_path_GREP
+case `"$ac_path_GREP" --version 2>&1` in
+*GNU*)
+ ac_cv_path_GREP="$ac_path_GREP" ac_path_GREP_found=:;;
+*)
+ ac_count=0
+ $as_echo_n 0123456789 >"conftest.in"
+ while :
+ do
+ cat "conftest.in" "conftest.in" >"conftest.tmp"
+ mv "conftest.tmp" "conftest.in"
+ cp "conftest.in" "conftest.nl"
+ $as_echo 'GREP' >> "conftest.nl"
+ "$ac_path_GREP" -e 'GREP$' -e '-(cannot match)-' < "conftest.nl" >"conftest.out" 2>/dev/null || break
+ diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break
+ as_fn_arith $ac_count + 1 && ac_count=$as_val
+ if test $ac_count -gt ${ac_path_GREP_max-0}; then
+ # Best one so far, save it but keep looking for a better one
+ ac_cv_path_GREP="$ac_path_GREP"
+ ac_path_GREP_max=$ac_count
+ fi
+ # 10*(2^10) chars as input seems more than enough
+ test $ac_count -gt 10 && break
+ done
+ rm -f conftest.in conftest.tmp conftest.nl conftest.out;;
+esac
+
+ $ac_path_GREP_found && break 3
+ done
+ done
+ done
+IFS=$as_save_IFS
+ if test -z "$ac_cv_path_GREP"; then
+ as_fn_error $? "no acceptable grep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5
+ fi
+else
+ ac_cv_path_GREP=$GREP
+fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_GREP" >&5
+$as_echo "$ac_cv_path_GREP" >&6; }
+ GREP="$ac_cv_path_GREP"
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for egrep" >&5
+$as_echo_n "checking for egrep... " >&6; }
+if ${ac_cv_path_EGREP+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ if echo a | $GREP -E '(a|b)' >/dev/null 2>&1
+ then ac_cv_path_EGREP="$GREP -E"
+ else
+ if test -z "$EGREP"; then
+ ac_path_EGREP_found=false
+ # Loop through the user's path and test for each of PROGNAME-LIST
+ as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ for ac_prog in egrep; do
+ for ac_exec_ext in '' $ac_executable_extensions; do
+ ac_path_EGREP="$as_dir/$ac_prog$ac_exec_ext"
+ as_fn_executable_p "$ac_path_EGREP" || continue
+# Check for GNU ac_path_EGREP and select it if it is found.
+ # Check for GNU $ac_path_EGREP
+case `"$ac_path_EGREP" --version 2>&1` in
+*GNU*)
+ ac_cv_path_EGREP="$ac_path_EGREP" ac_path_EGREP_found=:;;
+*)
+ ac_count=0
+ $as_echo_n 0123456789 >"conftest.in"
+ while :
+ do
+ cat "conftest.in" "conftest.in" >"conftest.tmp"
+ mv "conftest.tmp" "conftest.in"
+ cp "conftest.in" "conftest.nl"
+ $as_echo 'EGREP' >> "conftest.nl"
+ "$ac_path_EGREP" 'EGREP$' < "conftest.nl" >"conftest.out" 2>/dev/null || break
+ diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break
+ as_fn_arith $ac_count + 1 && ac_count=$as_val
+ if test $ac_count -gt ${ac_path_EGREP_max-0}; then
+ # Best one so far, save it but keep looking for a better one
+ ac_cv_path_EGREP="$ac_path_EGREP"
+ ac_path_EGREP_max=$ac_count
+ fi
+ # 10*(2^10) chars as input seems more than enough
+ test $ac_count -gt 10 && break
+ done
+ rm -f conftest.in conftest.tmp conftest.nl conftest.out;;
+esac
+
+ $ac_path_EGREP_found && break 3
+ done
+ done
+ done
+IFS=$as_save_IFS
+ if test -z "$ac_cv_path_EGREP"; then
+ as_fn_error $? "no acceptable egrep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5
+ fi
+else
+ ac_cv_path_EGREP=$EGREP
+fi
+
+ fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_EGREP" >&5
+$as_echo "$ac_cv_path_EGREP" >&6; }
+ EGREP="$ac_cv_path_EGREP"
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for ANSI C header files" >&5
+$as_echo_n "checking for ANSI C header files... " >&6; }
+if ${ac_cv_header_stdc+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+#include
+#include
+#include
+#include
+
+int
+main ()
+{
+
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+ ac_cv_header_stdc=yes
+else
+ ac_cv_header_stdc=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+
+if test $ac_cv_header_stdc = yes; then
+ # SunOS 4.x string.h does not declare mem*, contrary to ANSI.
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+#include
+
+_ACEOF
+if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
+ $EGREP "memchr" >/dev/null 2>&1; then :
+
+else
+ ac_cv_header_stdc=no
+fi
+rm -f conftest*
+
+fi
+
+if test $ac_cv_header_stdc = yes; then
+ # ISC 2.0.2 stdlib.h does not declare free, contrary to ANSI.
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+#include
+
+_ACEOF
+if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
+ $EGREP "free" >/dev/null 2>&1; then :
+
+else
+ ac_cv_header_stdc=no
+fi
+rm -f conftest*
+
+fi
+
+if test $ac_cv_header_stdc = yes; then
+ # /bin/cc in Irix-4.0.5 gets non-ANSI ctype macros unless using -ansi.
+ if test "$cross_compiling" = yes; then :
+ :
+else
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+#include
+#include
+#if ((' ' & 0x0FF) == 0x020)
+# define ISLOWER(c) ('a' <= (c) && (c) <= 'z')
+# define TOUPPER(c) (ISLOWER(c) ? 'A' + ((c) - 'a') : (c))
+#else
+# define ISLOWER(c) \
+ (('a' <= (c) && (c) <= 'i') \
+ || ('j' <= (c) && (c) <= 'r') \
+ || ('s' <= (c) && (c) <= 'z'))
+# define TOUPPER(c) (ISLOWER(c) ? ((c) | 0x40) : (c))
+#endif
+
+#define XOR(e, f) (((e) && !(f)) || (!(e) && (f)))
+int
+main ()
+{
+ int i;
+ for (i = 0; i < 256; i++)
+ if (XOR (islower (i), ISLOWER (i))
+ || toupper (i) != TOUPPER (i))
+ return 2;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_run "$LINENO"; then :
+
+else
+ ac_cv_header_stdc=no
+fi
+rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \
+ conftest.$ac_objext conftest.beam conftest.$ac_ext
+fi
+
+fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_header_stdc" >&5
+$as_echo "$ac_cv_header_stdc" >&6; }
+if test $ac_cv_header_stdc = yes; then
+
+$as_echo "#define STDC_HEADERS 1" >>confdefs.h
+
+fi
+
+# On IRIX 5.3, sys/types and inttypes.h are conflicting.
+for ac_header in sys/types.h sys/stat.h stdlib.h string.h memory.h strings.h \
+ inttypes.h stdint.h unistd.h
+do :
+ as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh`
+ac_fn_c_check_header_compile "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default
+"
+if eval test \"x\$"$as_ac_Header"\" = x"yes"; then :
+ cat >>confdefs.h <<_ACEOF
+#define `$as_echo "HAVE_$ac_header" | $as_tr_cpp` 1
+_ACEOF
+
+fi
+
+done
+
+
+for ac_header in wmmintrin.h
+do :
+ ac_fn_c_check_header_mongrel "$LINENO" "wmmintrin.h" "ac_cv_header_wmmintrin_h" "$ac_includes_default"
+if test "x$ac_cv_header_wmmintrin_h" = xyes; then :
+ cat >>confdefs.h <<_ACEOF
+#define HAVE_WMMINTRIN_H 1
+_ACEOF
+
+ CFLAGS+=" -maes -msse -msse2"
+ CPPFLAGS="-DFASTAES"
+
+
+fi
+
+done
+
+ac_config_files="$ac_config_files Makefile"
+
+cat >confcache <<\_ACEOF
+# This file is a shell script that caches the results of configure
+# tests run on this system so they can be shared between configure
+# scripts and configure runs, see configure's option --config-cache.
+# It is not useful on other systems. If it contains results you don't
+# want to keep, you may remove or edit it.
+#
+# config.status only pays attention to the cache file if you give it
+# the --recheck option to rerun configure.
+#
+# `ac_cv_env_foo' variables (set or unset) will be overridden when
+# loading this file, other *unset* `ac_cv_foo' will be assigned the
+# following values.
+
+_ACEOF
+
+# The following way of writing the cache mishandles newlines in values,
+# but we know of no workaround that is simple, portable, and efficient.
+# So, we kill variables containing newlines.
+# Ultrix sh set writes to stderr and can't be redirected directly,
+# and sets the high bit in the cache file unless we assign to the vars.
+(
+ for ac_var in `(set) 2>&1 | sed -n 's/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'`; do
+ eval ac_val=\$$ac_var
+ case $ac_val in #(
+ *${as_nl}*)
+ case $ac_var in #(
+ *_cv_*) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5
+$as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;;
+ esac
+ case $ac_var in #(
+ _ | IFS | as_nl) ;; #(
+ BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #(
+ *) { eval $ac_var=; unset $ac_var;} ;;
+ esac ;;
+ esac
+ done
+
+ (set) 2>&1 |
+ case $as_nl`(ac_space=' '; set) 2>&1` in #(
+ *${as_nl}ac_space=\ *)
+ # `set' does not quote correctly, so add quotes: double-quote
+ # substitution turns \\\\ into \\, and sed turns \\ into \.
+ sed -n \
+ "s/'/'\\\\''/g;
+ s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\\2'/p"
+ ;; #(
+ *)
+ # `set' quotes correctly as required by POSIX, so do not add quotes.
+ sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p"
+ ;;
+ esac |
+ sort
+) |
+ sed '
+ /^ac_cv_env_/b end
+ t clear
+ :clear
+ s/^\([^=]*\)=\(.*[{}].*\)$/test "${\1+set}" = set || &/
+ t end
+ s/^\([^=]*\)=\(.*\)$/\1=${\1=\2}/
+ :end' >>confcache
+if diff "$cache_file" confcache >/dev/null 2>&1; then :; else
+ if test -w "$cache_file"; then
+ if test "x$cache_file" != "x/dev/null"; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: updating cache $cache_file" >&5
+$as_echo "$as_me: updating cache $cache_file" >&6;}
+ if test ! -f "$cache_file" || test -h "$cache_file"; then
+ cat confcache >"$cache_file"
+ else
+ case $cache_file in #(
+ */* | ?:*)
+ mv -f confcache "$cache_file"$$ &&
+ mv -f "$cache_file"$$ "$cache_file" ;; #(
+ *)
+ mv -f confcache "$cache_file" ;;
+ esac
+ fi
+ fi
+ else
+ { $as_echo "$as_me:${as_lineno-$LINENO}: not updating unwritable cache $cache_file" >&5
+$as_echo "$as_me: not updating unwritable cache $cache_file" >&6;}
+ fi
+fi
+rm -f confcache
+
+test "x$prefix" = xNONE && prefix=$ac_default_prefix
+# Let make expand exec_prefix.
+test "x$exec_prefix" = xNONE && exec_prefix='${prefix}'
+
+# Transform confdefs.h into DEFS.
+# Protect against shell expansion while executing Makefile rules.
+# Protect against Makefile macro expansion.
+#
+# If the first sed substitution is executed (which looks for macros that
+# take arguments), then branch to the quote section. Otherwise,
+# look for a macro that doesn't take arguments.
+ac_script='
+:mline
+/\\$/{
+ N
+ s,\\\n,,
+ b mline
+}
+t clear
+:clear
+s/^[ ]*#[ ]*define[ ][ ]*\([^ (][^ (]*([^)]*)\)[ ]*\(.*\)/-D\1=\2/g
+t quote
+s/^[ ]*#[ ]*define[ ][ ]*\([^ ][^ ]*\)[ ]*\(.*\)/-D\1=\2/g
+t quote
+b any
+:quote
+s/[ `~#$^&*(){}\\|;'\''"<>?]/\\&/g
+s/\[/\\&/g
+s/\]/\\&/g
+s/\$/$$/g
+H
+:any
+${
+ g
+ s/^\n//
+ s/\n/ /g
+ p
+}
+'
+DEFS=`sed -n "$ac_script" confdefs.h`
+
+
+ac_libobjs=
+ac_ltlibobjs=
+U=
+for ac_i in : $LIBOBJS; do test "x$ac_i" = x: && continue
+ # 1. Remove the extension, and $U if already installed.
+ ac_script='s/\$U\././;s/\.o$//;s/\.obj$//'
+ ac_i=`$as_echo "$ac_i" | sed "$ac_script"`
+ # 2. Prepend LIBOBJDIR. When used with automake>=1.10 LIBOBJDIR
+ # will be set to the directory where LIBOBJS objects are built.
+ as_fn_append ac_libobjs " \${LIBOBJDIR}$ac_i\$U.$ac_objext"
+ as_fn_append ac_ltlibobjs " \${LIBOBJDIR}$ac_i"'$U.lo'
+done
+LIBOBJS=$ac_libobjs
+
+LTLIBOBJS=$ac_ltlibobjs
+
+
+
+: "${CONFIG_STATUS=./config.status}"
+ac_write_fail=0
+ac_clean_files_save=$ac_clean_files
+ac_clean_files="$ac_clean_files $CONFIG_STATUS"
+{ $as_echo "$as_me:${as_lineno-$LINENO}: creating $CONFIG_STATUS" >&5
+$as_echo "$as_me: creating $CONFIG_STATUS" >&6;}
+as_write_fail=0
+cat >$CONFIG_STATUS <<_ASEOF || as_write_fail=1
+#! $SHELL
+# Generated by $as_me.
+# Run this file to recreate the current configuration.
+# Compiler output produced by configure, useful for debugging
+# configure, is in config.log if it exists.
+
+debug=false
+ac_cs_recheck=false
+ac_cs_silent=false
+
+SHELL=\${CONFIG_SHELL-$SHELL}
+export SHELL
+_ASEOF
+cat >>$CONFIG_STATUS <<\_ASEOF || as_write_fail=1
+## -------------------- ##
+## M4sh Initialization. ##
+## -------------------- ##
+
+# Be more Bourne compatible
+DUALCASE=1; export DUALCASE # for MKS sh
+if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then :
+ emulate sh
+ NULLCMD=:
+ # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which
+ # is contrary to our usage. Disable this feature.
+ alias -g '${1+"$@"}'='"$@"'
+ setopt NO_GLOB_SUBST
+else
+ case `(set -o) 2>/dev/null` in #(
+ *posix*) :
+ set -o posix ;; #(
+ *) :
+ ;;
+esac
+fi
+
+
+as_nl='
+'
+export as_nl
+# Printing a long string crashes Solaris 7 /usr/bin/printf.
+as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'
+as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo
+as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo
+# Prefer a ksh shell builtin over an external printf program on Solaris,
+# but without wasting forks for bash or zsh.
+if test -z "$BASH_VERSION$ZSH_VERSION" \
+ && (test "X`print -r -- $as_echo`" = "X$as_echo") 2>/dev/null; then
+ as_echo='print -r --'
+ as_echo_n='print -rn --'
+elif (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then
+ as_echo='printf %s\n'
+ as_echo_n='printf %s'
+else
+ if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then
+ as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"'
+ as_echo_n='/usr/ucb/echo -n'
+ else
+ as_echo_body='eval expr "X$1" : "X\\(.*\\)"'
+ as_echo_n_body='eval
+ arg=$1;
+ case $arg in #(
+ *"$as_nl"*)
+ expr "X$arg" : "X\\(.*\\)$as_nl";
+ arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;;
+ esac;
+ expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl"
+ '
+ export as_echo_n_body
+ as_echo_n='sh -c $as_echo_n_body as_echo'
+ fi
+ export as_echo_body
+ as_echo='sh -c $as_echo_body as_echo'
+fi
+
+# The user is always right.
+if test "${PATH_SEPARATOR+set}" != set; then
+ PATH_SEPARATOR=:
+ (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && {
+ (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 ||
+ PATH_SEPARATOR=';'
+ }
+fi
+
+
+# IFS
+# We need space, tab and new line, in precisely that order. Quoting is
+# there to prevent editors from complaining about space-tab.
+# (If _AS_PATH_WALK were called with IFS unset, it would disable word
+# splitting by setting IFS to empty value.)
+IFS=" "" $as_nl"
+
+# Find who we are. Look in the path if we contain no directory separator.
+as_myself=
+case $0 in #((
+ *[\\/]* ) as_myself=$0 ;;
+ *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+ IFS=$as_save_IFS
+ test -z "$as_dir" && as_dir=.
+ test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break
+ done
+IFS=$as_save_IFS
+
+ ;;
+esac
+# We did not find ourselves, most probably we were run as `sh COMMAND'
+# in which case we are not to be found in the path.
+if test "x$as_myself" = x; then
+ as_myself=$0
+fi
+if test ! -f "$as_myself"; then
+ $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2
+ exit 1
+fi
+
+# Unset variables that we do not need and which cause bugs (e.g. in
+# pre-3.0 UWIN ksh). But do not cause bugs in bash 2.01; the "|| exit 1"
+# suppresses any "Segmentation fault" message there. '((' could
+# trigger a bug in pdksh 5.2.14.
+for as_var in BASH_ENV ENV MAIL MAILPATH
+do eval test x\${$as_var+set} = xset \
+ && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || :
+done
+PS1='$ '
+PS2='> '
+PS4='+ '
+
+# NLS nuisances.
+LC_ALL=C
+export LC_ALL
+LANGUAGE=C
+export LANGUAGE
+
+# CDPATH.
+(unset CDPATH) >/dev/null 2>&1 && unset CDPATH
+
+
+# as_fn_error STATUS ERROR [LINENO LOG_FD]
+# ----------------------------------------
+# Output "`basename $0`: error: ERROR" to stderr. If LINENO and LOG_FD are
+# provided, also output the error to LOG_FD, referencing LINENO. Then exit the
+# script with STATUS, using 1 if that was 0.
+as_fn_error ()
+{
+ as_status=$1; test $as_status -eq 0 && as_status=1
+ if test "$4"; then
+ as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+ $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4
+ fi
+ $as_echo "$as_me: error: $2" >&2
+ as_fn_exit $as_status
+} # as_fn_error
+
+
+# as_fn_set_status STATUS
+# -----------------------
+# Set $? to STATUS, without forking.
+as_fn_set_status ()
+{
+ return $1
+} # as_fn_set_status
+
+# as_fn_exit STATUS
+# -----------------
+# Exit the shell with STATUS, even in a "trap 0" or "set -e" context.
+as_fn_exit ()
+{
+ set +e
+ as_fn_set_status $1
+ exit $1
+} # as_fn_exit
+
+# as_fn_unset VAR
+# ---------------
+# Portably unset VAR.
+as_fn_unset ()
+{
+ { eval $1=; unset $1;}
+}
+as_unset=as_fn_unset
+# as_fn_append VAR VALUE
+# ----------------------
+# Append the text in VALUE to the end of the definition contained in VAR. Take
+# advantage of any shell optimizations that allow amortized linear growth over
+# repeated appends, instead of the typical quadratic growth present in naive
+# implementations.
+if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null; then :
+ eval 'as_fn_append ()
+ {
+ eval $1+=\$2
+ }'
+else
+ as_fn_append ()
+ {
+ eval $1=\$$1\$2
+ }
+fi # as_fn_append
+
+# as_fn_arith ARG...
+# ------------------
+# Perform arithmetic evaluation on the ARGs, and store the result in the
+# global $as_val. Take advantage of shells that can avoid forks. The arguments
+# must be portable across $(()) and expr.
+if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then :
+ eval 'as_fn_arith ()
+ {
+ as_val=$(( $* ))
+ }'
+else
+ as_fn_arith ()
+ {
+ as_val=`expr "$@" || test $? -eq 1`
+ }
+fi # as_fn_arith
+
+
+if expr a : '\(a\)' >/dev/null 2>&1 &&
+ test "X`expr 00001 : '.*\(...\)'`" = X001; then
+ as_expr=expr
+else
+ as_expr=false
+fi
+
+if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then
+ as_basename=basename
+else
+ as_basename=false
+fi
+
+if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then
+ as_dirname=dirname
+else
+ as_dirname=false
+fi
+
+as_me=`$as_basename -- "$0" ||
+$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \
+ X"$0" : 'X\(//\)$' \| \
+ X"$0" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X/"$0" |
+ sed '/^.*\/\([^/][^/]*\)\/*$/{
+ s//\1/
+ q
+ }
+ /^X\/\(\/\/\)$/{
+ s//\1/
+ q
+ }
+ /^X\/\(\/\).*/{
+ s//\1/
+ q
+ }
+ s/.*/./; q'`
+
+# Avoid depending upon Character Ranges.
+as_cr_letters='abcdefghijklmnopqrstuvwxyz'
+as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+as_cr_Letters=$as_cr_letters$as_cr_LETTERS
+as_cr_digits='0123456789'
+as_cr_alnum=$as_cr_Letters$as_cr_digits
+
+ECHO_C= ECHO_N= ECHO_T=
+case `echo -n x` in #(((((
+-n*)
+ case `echo 'xy\c'` in
+ *c*) ECHO_T=' ';; # ECHO_T is single tab character.
+ xy) ECHO_C='\c';;
+ *) echo `echo ksh88 bug on AIX 6.1` > /dev/null
+ ECHO_T=' ';;
+ esac;;
+*)
+ ECHO_N='-n';;
+esac
+
+rm -f conf$$ conf$$.exe conf$$.file
+if test -d conf$$.dir; then
+ rm -f conf$$.dir/conf$$.file
+else
+ rm -f conf$$.dir
+ mkdir conf$$.dir 2>/dev/null
+fi
+if (echo >conf$$.file) 2>/dev/null; then
+ if ln -s conf$$.file conf$$ 2>/dev/null; then
+ as_ln_s='ln -s'
+ # ... but there are two gotchas:
+ # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail.
+ # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable.
+ # In both cases, we have to default to `cp -pR'.
+ ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe ||
+ as_ln_s='cp -pR'
+ elif ln conf$$.file conf$$ 2>/dev/null; then
+ as_ln_s=ln
+ else
+ as_ln_s='cp -pR'
+ fi
+else
+ as_ln_s='cp -pR'
+fi
+rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file
+rmdir conf$$.dir 2>/dev/null
+
+
+# as_fn_mkdir_p
+# -------------
+# Create "$as_dir" as a directory, including parents if necessary.
+as_fn_mkdir_p ()
+{
+
+ case $as_dir in #(
+ -*) as_dir=./$as_dir;;
+ esac
+ test -d "$as_dir" || eval $as_mkdir_p || {
+ as_dirs=
+ while :; do
+ case $as_dir in #(
+ *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'(
+ *) as_qdir=$as_dir;;
+ esac
+ as_dirs="'$as_qdir' $as_dirs"
+ as_dir=`$as_dirname -- "$as_dir" ||
+$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+ X"$as_dir" : 'X\(//\)[^/]' \| \
+ X"$as_dir" : 'X\(//\)$' \| \
+ X"$as_dir" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X"$as_dir" |
+ sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+ s//\1/
+ q
+ }
+ /^X\(\/\/\)[^/].*/{
+ s//\1/
+ q
+ }
+ /^X\(\/\/\)$/{
+ s//\1/
+ q
+ }
+ /^X\(\/\).*/{
+ s//\1/
+ q
+ }
+ s/.*/./; q'`
+ test -d "$as_dir" && break
+ done
+ test -z "$as_dirs" || eval "mkdir $as_dirs"
+ } || test -d "$as_dir" || as_fn_error $? "cannot create directory $as_dir"
+
+
+} # as_fn_mkdir_p
+if mkdir -p . 2>/dev/null; then
+ as_mkdir_p='mkdir -p "$as_dir"'
+else
+ test -d ./-p && rmdir ./-p
+ as_mkdir_p=false
+fi
+
+
+# as_fn_executable_p FILE
+# -----------------------
+# Test if FILE is an executable regular file.
+as_fn_executable_p ()
+{
+ test -f "$1" && test -x "$1"
+} # as_fn_executable_p
+as_test_x='test -x'
+as_executable_p=as_fn_executable_p
+
+# Sed expression to map a string onto a valid CPP name.
+as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'"
+
+# Sed expression to map a string onto a valid variable name.
+as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'"
+
+
+exec 6>&1
+## ----------------------------------- ##
+## Main body of $CONFIG_STATUS script. ##
+## ----------------------------------- ##
+_ASEOF
+test $as_write_fail = 0 && chmod +x $CONFIG_STATUS || ac_write_fail=1
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+# Save the log message, to keep $0 and so on meaningful, and to
+# report actual input values of CONFIG_FILES etc. instead of their
+# values after options handling.
+ac_log="
+This file was extended by $as_me, which was
+generated by GNU Autoconf 2.69. Invocation command line was
+
+ CONFIG_FILES = $CONFIG_FILES
+ CONFIG_HEADERS = $CONFIG_HEADERS
+ CONFIG_LINKS = $CONFIG_LINKS
+ CONFIG_COMMANDS = $CONFIG_COMMANDS
+ $ $0 $@
+
+on `(hostname || uname -n) 2>/dev/null | sed 1q`
+"
+
+_ACEOF
+
+case $ac_config_files in *"
+"*) set x $ac_config_files; shift; ac_config_files=$*;;
+esac
+
+
+
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+# Files that config.status was made for.
+config_files="$ac_config_files"
+
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+ac_cs_usage="\
+\`$as_me' instantiates files and other configuration actions
+from templates according to the current configuration. Unless the files
+and actions are specified as TAGs, all are instantiated by default.
+
+Usage: $0 [OPTION]... [TAG]...
+
+ -h, --help print this help, then exit
+ -V, --version print version number and configuration settings, then exit
+ --config print configuration, then exit
+ -q, --quiet, --silent
+ do not print progress messages
+ -d, --debug don't remove temporary files
+ --recheck update $as_me by reconfiguring in the same conditions
+ --file=FILE[:TEMPLATE]
+ instantiate the configuration file FILE
+
+Configuration files:
+$config_files
+
+Report bugs to the package provider."
+
+_ACEOF
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
+ac_cs_version="\\
+config.status
+configured by $0, generated by GNU Autoconf 2.69,
+ with options \\"\$ac_cs_config\\"
+
+Copyright (C) 2012 Free Software Foundation, Inc.
+This config.status script is free software; the Free Software Foundation
+gives unlimited permission to copy, distribute and modify it."
+
+ac_pwd='$ac_pwd'
+srcdir='$srcdir'
+INSTALL='$INSTALL'
+test -n "\$AWK" || AWK=awk
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+# The default lists apply if the user does not specify any file.
+ac_need_defaults=:
+while test $# != 0
+do
+ case $1 in
+ --*=?*)
+ ac_option=`expr "X$1" : 'X\([^=]*\)='`
+ ac_optarg=`expr "X$1" : 'X[^=]*=\(.*\)'`
+ ac_shift=:
+ ;;
+ --*=)
+ ac_option=`expr "X$1" : 'X\([^=]*\)='`
+ ac_optarg=
+ ac_shift=:
+ ;;
+ *)
+ ac_option=$1
+ ac_optarg=$2
+ ac_shift=shift
+ ;;
+ esac
+
+ case $ac_option in
+ # Handling of the options.
+ -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r)
+ ac_cs_recheck=: ;;
+ --version | --versio | --versi | --vers | --ver | --ve | --v | -V )
+ $as_echo "$ac_cs_version"; exit ;;
+ --config | --confi | --conf | --con | --co | --c )
+ $as_echo "$ac_cs_config"; exit ;;
+ --debug | --debu | --deb | --de | --d | -d )
+ debug=: ;;
+ --file | --fil | --fi | --f )
+ $ac_shift
+ case $ac_optarg in
+ *\'*) ac_optarg=`$as_echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;;
+ '') as_fn_error $? "missing file argument" ;;
+ esac
+ as_fn_append CONFIG_FILES " '$ac_optarg'"
+ ac_need_defaults=false;;
+ --he | --h | --help | --hel | -h )
+ $as_echo "$ac_cs_usage"; exit ;;
+ -q | -quiet | --quiet | --quie | --qui | --qu | --q \
+ | -silent | --silent | --silen | --sile | --sil | --si | --s)
+ ac_cs_silent=: ;;
+
+ # This is an error.
+ -*) as_fn_error $? "unrecognized option: \`$1'
+Try \`$0 --help' for more information." ;;
+
+ *) as_fn_append ac_config_targets " $1"
+ ac_need_defaults=false ;;
+
+ esac
+ shift
+done
+
+ac_configure_extra_args=
+
+if $ac_cs_silent; then
+ exec 6>/dev/null
+ ac_configure_extra_args="$ac_configure_extra_args --silent"
+fi
+
+_ACEOF
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+if \$ac_cs_recheck; then
+ set X $SHELL '$0' $ac_configure_args \$ac_configure_extra_args --no-create --no-recursion
+ shift
+ \$as_echo "running CONFIG_SHELL=$SHELL \$*" >&6
+ CONFIG_SHELL='$SHELL'
+ export CONFIG_SHELL
+ exec "\$@"
+fi
+
+_ACEOF
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+exec 5>>config.log
+{
+ echo
+ sed 'h;s/./-/g;s/^.../## /;s/...$/ ##/;p;x;p;x' <<_ASBOX
+## Running $as_me. ##
+_ASBOX
+ $as_echo "$ac_log"
+} >&5
+
+_ACEOF
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+
+# Handling of arguments.
+for ac_config_target in $ac_config_targets
+do
+ case $ac_config_target in
+ "Makefile") CONFIG_FILES="$CONFIG_FILES Makefile" ;;
+
+ *) as_fn_error $? "invalid argument: \`$ac_config_target'" "$LINENO" 5;;
+ esac
+done
+
+
+# If the user did not use the arguments to specify the items to instantiate,
+# then the envvar interface is used. Set only those that are not.
+# We use the long form for the default assignment because of an extremely
+# bizarre bug on SunOS 4.1.3.
+if $ac_need_defaults; then
+ test "${CONFIG_FILES+set}" = set || CONFIG_FILES=$config_files
+fi
+
+# Have a temporary directory for convenience. Make it in the build tree
+# simply because there is no reason against having it here, and in addition,
+# creating and moving files from /tmp can sometimes cause problems.
+# Hook for its removal unless debugging.
+# Note that there is a small window in which the directory will not be cleaned:
+# after its creation but before its name has been assigned to `$tmp'.
+$debug ||
+{
+ tmp= ac_tmp=
+ trap 'exit_status=$?
+ : "${ac_tmp:=$tmp}"
+ { test ! -d "$ac_tmp" || rm -fr "$ac_tmp"; } && exit $exit_status
+' 0
+ trap 'as_fn_exit 1' 1 2 13 15
+}
+# Create a (secure) tmp directory for tmp files.
+
+{
+ tmp=`(umask 077 && mktemp -d "./confXXXXXX") 2>/dev/null` &&
+ test -d "$tmp"
+} ||
+{
+ tmp=./conf$$-$RANDOM
+ (umask 077 && mkdir "$tmp")
+} || as_fn_error $? "cannot create a temporary directory in ." "$LINENO" 5
+ac_tmp=$tmp
+
+# Set up the scripts for CONFIG_FILES section.
+# No need to generate them if there are no CONFIG_FILES.
+# This happens for instance with `./config.status config.h'.
+if test -n "$CONFIG_FILES"; then
+
+
+ac_cr=`echo X | tr X '\015'`
+# On cygwin, bash can eat \r inside `` if the user requested igncr.
+# But we know of no other shell where ac_cr would be empty at this
+# point, so we can use a bashism as a fallback.
+if test "x$ac_cr" = x; then
+ eval ac_cr=\$\'\\r\'
+fi
+ac_cs_awk_cr=`$AWK 'BEGIN { print "a\rb" }' /dev/null`
+if test "$ac_cs_awk_cr" = "a${ac_cr}b"; then
+ ac_cs_awk_cr='\\r'
+else
+ ac_cs_awk_cr=$ac_cr
+fi
+
+echo 'BEGIN {' >"$ac_tmp/subs1.awk" &&
+_ACEOF
+
+
+{
+ echo "cat >conf$$subs.awk <<_ACEOF" &&
+ echo "$ac_subst_vars" | sed 's/.*/&!$&$ac_delim/' &&
+ echo "_ACEOF"
+} >conf$$subs.sh ||
+ as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5
+ac_delim_num=`echo "$ac_subst_vars" | grep -c '^'`
+ac_delim='%!_!# '
+for ac_last_try in false false false false false :; do
+ . ./conf$$subs.sh ||
+ as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5
+
+ ac_delim_n=`sed -n "s/.*$ac_delim\$/X/p" conf$$subs.awk | grep -c X`
+ if test $ac_delim_n = $ac_delim_num; then
+ break
+ elif $ac_last_try; then
+ as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5
+ else
+ ac_delim="$ac_delim!$ac_delim _$ac_delim!! "
+ fi
+done
+rm -f conf$$subs.sh
+
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+cat >>"\$ac_tmp/subs1.awk" <<\\_ACAWK &&
+_ACEOF
+sed -n '
+h
+s/^/S["/; s/!.*/"]=/
+p
+g
+s/^[^!]*!//
+:repl
+t repl
+s/'"$ac_delim"'$//
+t delim
+:nl
+h
+s/\(.\{148\}\)..*/\1/
+t more1
+s/["\\]/\\&/g; s/^/"/; s/$/\\n"\\/
+p
+n
+b repl
+:more1
+s/["\\]/\\&/g; s/^/"/; s/$/"\\/
+p
+g
+s/.\{148\}//
+t nl
+:delim
+h
+s/\(.\{148\}\)..*/\1/
+t more2
+s/["\\]/\\&/g; s/^/"/; s/$/"/
+p
+b
+:more2
+s/["\\]/\\&/g; s/^/"/; s/$/"\\/
+p
+g
+s/.\{148\}//
+t delim
+' >$CONFIG_STATUS || ac_write_fail=1
+rm -f conf$$subs.awk
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+_ACAWK
+cat >>"\$ac_tmp/subs1.awk" <<_ACAWK &&
+ for (key in S) S_is_set[key] = 1
+ FS = ""
+
+}
+{
+ line = $ 0
+ nfields = split(line, field, "@")
+ substed = 0
+ len = length(field[1])
+ for (i = 2; i < nfields; i++) {
+ key = field[i]
+ keylen = length(key)
+ if (S_is_set[key]) {
+ value = S[key]
+ line = substr(line, 1, len) "" value "" substr(line, len + keylen + 3)
+ len += length(value) + length(field[++i])
+ substed = 1
+ } else
+ len += 1 + keylen
+ }
+
+ print line
+}
+
+_ACAWK
+_ACEOF
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+if sed "s/$ac_cr//" < /dev/null > /dev/null 2>&1; then
+ sed "s/$ac_cr\$//; s/$ac_cr/$ac_cs_awk_cr/g"
+else
+ cat
+fi < "$ac_tmp/subs1.awk" > "$ac_tmp/subs.awk" \
+ || as_fn_error $? "could not setup config files machinery" "$LINENO" 5
+_ACEOF
+
+# VPATH may cause trouble with some makes, so we remove sole $(srcdir),
+# ${srcdir} and @srcdir@ entries from VPATH if srcdir is ".", strip leading and
+# trailing colons and then remove the whole line if VPATH becomes empty
+# (actually we leave an empty line to preserve line numbers).
+if test "x$srcdir" = x.; then
+ ac_vpsub='/^[ ]*VPATH[ ]*=[ ]*/{
+h
+s///
+s/^/:/
+s/[ ]*$/:/
+s/:\$(srcdir):/:/g
+s/:\${srcdir}:/:/g
+s/:@srcdir@:/:/g
+s/^:*//
+s/:*$//
+x
+s/\(=[ ]*\).*/\1/
+G
+s/\n//
+s/^[^=]*=[ ]*$//
+}'
+fi
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+fi # test -n "$CONFIG_FILES"
+
+
+eval set X " :F $CONFIG_FILES "
+shift
+for ac_tag
+do
+ case $ac_tag in
+ :[FHLC]) ac_mode=$ac_tag; continue;;
+ esac
+ case $ac_mode$ac_tag in
+ :[FHL]*:*);;
+ :L* | :C*:*) as_fn_error $? "invalid tag \`$ac_tag'" "$LINENO" 5;;
+ :[FH]-) ac_tag=-:-;;
+ :[FH]*) ac_tag=$ac_tag:$ac_tag.in;;
+ esac
+ ac_save_IFS=$IFS
+ IFS=:
+ set x $ac_tag
+ IFS=$ac_save_IFS
+ shift
+ ac_file=$1
+ shift
+
+ case $ac_mode in
+ :L) ac_source=$1;;
+ :[FH])
+ ac_file_inputs=
+ for ac_f
+ do
+ case $ac_f in
+ -) ac_f="$ac_tmp/stdin";;
+ *) # Look for the file first in the build tree, then in the source tree
+ # (if the path is not absolute). The absolute path cannot be DOS-style,
+ # because $ac_f cannot contain `:'.
+ test -f "$ac_f" ||
+ case $ac_f in
+ [\\/$]*) false;;
+ *) test -f "$srcdir/$ac_f" && ac_f="$srcdir/$ac_f";;
+ esac ||
+ as_fn_error 1 "cannot find input file: \`$ac_f'" "$LINENO" 5;;
+ esac
+ case $ac_f in *\'*) ac_f=`$as_echo "$ac_f" | sed "s/'/'\\\\\\\\''/g"`;; esac
+ as_fn_append ac_file_inputs " '$ac_f'"
+ done
+
+ # Let's still pretend it is `configure' which instantiates (i.e., don't
+ # use $as_me), people would be surprised to read:
+ # /* config.h. Generated by config.status. */
+ configure_input='Generated from '`
+ $as_echo "$*" | sed 's|^[^:]*/||;s|:[^:]*/|, |g'
+ `' by configure.'
+ if test x"$ac_file" != x-; then
+ configure_input="$ac_file. $configure_input"
+ { $as_echo "$as_me:${as_lineno-$LINENO}: creating $ac_file" >&5
+$as_echo "$as_me: creating $ac_file" >&6;}
+ fi
+ # Neutralize special characters interpreted by sed in replacement strings.
+ case $configure_input in #(
+ *\&* | *\|* | *\\* )
+ ac_sed_conf_input=`$as_echo "$configure_input" |
+ sed 's/[\\\\&|]/\\\\&/g'`;; #(
+ *) ac_sed_conf_input=$configure_input;;
+ esac
+
+ case $ac_tag in
+ *:-:* | *:-) cat >"$ac_tmp/stdin" \
+ || as_fn_error $? "could not create $ac_file" "$LINENO" 5 ;;
+ esac
+ ;;
+ esac
+
+ ac_dir=`$as_dirname -- "$ac_file" ||
+$as_expr X"$ac_file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+ X"$ac_file" : 'X\(//\)[^/]' \| \
+ X"$ac_file" : 'X\(//\)$' \| \
+ X"$ac_file" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X"$ac_file" |
+ sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+ s//\1/
+ q
+ }
+ /^X\(\/\/\)[^/].*/{
+ s//\1/
+ q
+ }
+ /^X\(\/\/\)$/{
+ s//\1/
+ q
+ }
+ /^X\(\/\).*/{
+ s//\1/
+ q
+ }
+ s/.*/./; q'`
+ as_dir="$ac_dir"; as_fn_mkdir_p
+ ac_builddir=.
+
+case "$ac_dir" in
+.) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;;
+*)
+ ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'`
+ # A ".." for each directory in $ac_dir_suffix.
+ ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'`
+ case $ac_top_builddir_sub in
+ "") ac_top_builddir_sub=. ac_top_build_prefix= ;;
+ *) ac_top_build_prefix=$ac_top_builddir_sub/ ;;
+ esac ;;
+esac
+ac_abs_top_builddir=$ac_pwd
+ac_abs_builddir=$ac_pwd$ac_dir_suffix
+# for backward compatibility:
+ac_top_builddir=$ac_top_build_prefix
+
+case $srcdir in
+ .) # We are building in place.
+ ac_srcdir=.
+ ac_top_srcdir=$ac_top_builddir_sub
+ ac_abs_top_srcdir=$ac_pwd ;;
+ [\\/]* | ?:[\\/]* ) # Absolute name.
+ ac_srcdir=$srcdir$ac_dir_suffix;
+ ac_top_srcdir=$srcdir
+ ac_abs_top_srcdir=$srcdir ;;
+ *) # Relative name.
+ ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix
+ ac_top_srcdir=$ac_top_build_prefix$srcdir
+ ac_abs_top_srcdir=$ac_pwd/$srcdir ;;
+esac
+ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix
+
+
+ case $ac_mode in
+ :F)
+ #
+ # CONFIG_FILE
+ #
+
+ case $INSTALL in
+ [\\/$]* | ?:[\\/]* ) ac_INSTALL=$INSTALL ;;
+ *) ac_INSTALL=$ac_top_build_prefix$INSTALL ;;
+ esac
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+# If the template does not know about datarootdir, expand it.
+# FIXME: This hack should be removed a few years after 2.60.
+ac_datarootdir_hack=; ac_datarootdir_seen=
+ac_sed_dataroot='
+/datarootdir/ {
+ p
+ q
+}
+/@datadir@/p
+/@docdir@/p
+/@infodir@/p
+/@localedir@/p
+/@mandir@/p'
+case `eval "sed -n \"\$ac_sed_dataroot\" $ac_file_inputs"` in
+*datarootdir*) ac_datarootdir_seen=yes;;
+*@datadir@*|*@docdir@*|*@infodir@*|*@localedir@*|*@mandir@*)
+ { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&5
+$as_echo "$as_me: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&2;}
+_ACEOF
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+ ac_datarootdir_hack='
+ s&@datadir@&$datadir&g
+ s&@docdir@&$docdir&g
+ s&@infodir@&$infodir&g
+ s&@localedir@&$localedir&g
+ s&@mandir@&$mandir&g
+ s&\\\${datarootdir}&$datarootdir&g' ;;
+esac
+_ACEOF
+
+# Neutralize VPATH when `$srcdir' = `.'.
+# Shell code in configure.ac might set extrasub.
+# FIXME: do we really want to maintain this feature?
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+ac_sed_extra="$ac_vpsub
+$extrasub
+_ACEOF
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+:t
+/@[a-zA-Z_][a-zA-Z_0-9]*@/!b
+s|@configure_input@|$ac_sed_conf_input|;t t
+s&@top_builddir@&$ac_top_builddir_sub&;t t
+s&@top_build_prefix@&$ac_top_build_prefix&;t t
+s&@srcdir@&$ac_srcdir&;t t
+s&@abs_srcdir@&$ac_abs_srcdir&;t t
+s&@top_srcdir@&$ac_top_srcdir&;t t
+s&@abs_top_srcdir@&$ac_abs_top_srcdir&;t t
+s&@builddir@&$ac_builddir&;t t
+s&@abs_builddir@&$ac_abs_builddir&;t t
+s&@abs_top_builddir@&$ac_abs_top_builddir&;t t
+s&@INSTALL@&$ac_INSTALL&;t t
+$ac_datarootdir_hack
+"
+eval sed \"\$ac_sed_extra\" "$ac_file_inputs" | $AWK -f "$ac_tmp/subs.awk" \
+ >$ac_tmp/out || as_fn_error $? "could not create $ac_file" "$LINENO" 5
+
+test -z "$ac_datarootdir_hack$ac_datarootdir_seen" &&
+ { ac_out=`sed -n '/\${datarootdir}/p' "$ac_tmp/out"`; test -n "$ac_out"; } &&
+ { ac_out=`sed -n '/^[ ]*datarootdir[ ]*:*=/p' \
+ "$ac_tmp/out"`; test -z "$ac_out"; } &&
+ { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file contains a reference to the variable \`datarootdir'
+which seems to be undefined. Please make sure it is defined" >&5
+$as_echo "$as_me: WARNING: $ac_file contains a reference to the variable \`datarootdir'
+which seems to be undefined. Please make sure it is defined" >&2;}
+
+ rm -f "$ac_tmp/stdin"
+ case $ac_file in
+ -) cat "$ac_tmp/out" && rm -f "$ac_tmp/out";;
+ *) rm -f "$ac_file" && mv "$ac_tmp/out" "$ac_file";;
+ esac \
+ || as_fn_error $? "could not create $ac_file" "$LINENO" 5
+ ;;
+
+
+
+ esac
+
+done # for ac_tag
+
+
+as_fn_exit 0
+_ACEOF
+ac_clean_files=$ac_clean_files_save
+
+test $ac_write_fail = 0 ||
+ as_fn_error $? "write failure creating $CONFIG_STATUS" "$LINENO" 5
+
+
+# configure is writing to config.log, and then calls config.status.
+# config.status does its own redirection, appending to config.log.
+# Unfortunately, on DOS this fails, as config.log is still kept open
+# by configure, so config.status won't be able to write to it; its
+# output is simply discarded. So we exec the FD to /dev/null,
+# effectively closing config.log, so it can be properly (re)opened and
+# appended to by config.status. When coming back to configure, we
+# need to make the FD available again.
+if test "$no_create" != yes; then
+ ac_cs_success=:
+ ac_config_status_args=
+ test "$silent" = yes &&
+ ac_config_status_args="$ac_config_status_args --quiet"
+ exec 5>/dev/null
+ $SHELL $CONFIG_STATUS $ac_config_status_args || ac_cs_success=false
+ exec 5>>config.log
+ # Use ||, not &&, to avoid exiting from the if with $? = 1, which
+ # would make configure fail if this is the last instruction.
+ $ac_cs_success || as_fn_exit 1
+fi
+if test -n "$ac_unrecognized_opts" && test "$enable_option_checking" != no; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: unrecognized options: $ac_unrecognized_opts" >&5
+$as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2;}
+fi
+
diff --git a/tools/gzinject/configure.ac b/tools/gzinject/configure.ac
new file mode 100644
index 000000000..bf8d7008d
--- /dev/null
+++ b/tools/gzinject/configure.ac
@@ -0,0 +1,14 @@
+AC_PREREQ([2.69])
+AC_INIT
+AC_PREFIX_DEFAULT([/usr/local])
+AC_PROG_CC
+AC_PROG_INSTALL
+CFLAGS="$CFLAGS -O3"
+AC_CHECK_HEADERS([wmmintrin.h],
+ [
+    CFLAGS="$CFLAGS -maes -msse -msse2"
+    CPPFLAGS="$CPPFLAGS -DFASTAES"
+ ]
+)
+AC_CONFIG_FILES([Makefile])
+AC_OUTPUT
diff --git a/tools/gzinject/install-sh b/tools/gzinject/install-sh
new file mode 100644
index 000000000..0360b79e7
--- /dev/null
+++ b/tools/gzinject/install-sh
@@ -0,0 +1,501 @@
+#!/bin/sh
+# install - install a program, script, or datafile
+
+scriptversion=2016-01-11.22; # UTC
+
+# This originates from X11R5 (mit/util/scripts/install.sh), which was
+# later released in X11R6 (xc/config/util/install.sh) with the
+# following copyright and license.
+#
+# Copyright (C) 1994 X Consortium
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+# AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNEC-
+# TION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#
+# Except as contained in this notice, the name of the X Consortium shall not
+# be used in advertising or otherwise to promote the sale, use or other deal-
+# ings in this Software without prior written authorization from the X Consor-
+# tium.
+#
+#
+# FSF changes to this file are in the public domain.
+#
+# Calling this script install-sh is preferred over install.sh, to prevent
+# 'make' implicit rules from creating a file called install from it
+# when there is no Makefile.
+#
+# This script is compatible with the BSD install script, but was written
+# from scratch.
+
+tab=' '
+nl='
+'
+IFS=" $tab$nl"
+
+# Set DOITPROG to "echo" to test this script.
+
+doit=${DOITPROG-}
+doit_exec=${doit:-exec}
+
+# Put in absolute file names if you don't have them in your path;
+# or use environment vars.
+
+chgrpprog=${CHGRPPROG-chgrp}
+chmodprog=${CHMODPROG-chmod}
+chownprog=${CHOWNPROG-chown}
+cmpprog=${CMPPROG-cmp}
+cpprog=${CPPROG-cp}
+mkdirprog=${MKDIRPROG-mkdir}
+mvprog=${MVPROG-mv}
+rmprog=${RMPROG-rm}
+stripprog=${STRIPPROG-strip}
+
+posix_mkdir=
+
+# Desired mode of installed file.
+mode=0755
+
+chgrpcmd=
+chmodcmd=$chmodprog
+chowncmd=
+mvcmd=$mvprog
+rmcmd="$rmprog -f"
+stripcmd=
+
+src=
+dst=
+dir_arg=
+dst_arg=
+
+copy_on_change=false
+is_target_a_directory=possibly
+
+usage="\
+Usage: $0 [OPTION]... [-T] SRCFILE DSTFILE
+ or: $0 [OPTION]... SRCFILES... DIRECTORY
+ or: $0 [OPTION]... -t DIRECTORY SRCFILES...
+ or: $0 [OPTION]... -d DIRECTORIES...
+
+In the 1st form, copy SRCFILE to DSTFILE.
+In the 2nd and 3rd, copy all SRCFILES to DIRECTORY.
+In the 4th, create DIRECTORIES.
+
+Options:
+ --help display this help and exit.
+ --version display version info and exit.
+
+ -c (ignored)
+ -C install only if different (preserve the last data modification time)
+ -d create directories instead of installing files.
+ -g GROUP $chgrpprog installed files to GROUP.
+ -m MODE $chmodprog installed files to MODE.
+ -o USER $chownprog installed files to USER.
+ -s $stripprog installed files.
+ -t DIRECTORY install into DIRECTORY.
+ -T report an error if DSTFILE is a directory.
+
+Environment variables override the default commands:
+ CHGRPPROG CHMODPROG CHOWNPROG CMPPROG CPPROG MKDIRPROG MVPROG
+ RMPROG STRIPPROG
+"
+
+while test $# -ne 0; do
+ case $1 in
+ -c) ;;
+
+ -C) copy_on_change=true;;
+
+ -d) dir_arg=true;;
+
+ -g) chgrpcmd="$chgrpprog $2"
+ shift;;
+
+ --help) echo "$usage"; exit $?;;
+
+ -m) mode=$2
+ case $mode in
+ *' '* | *"$tab"* | *"$nl"* | *'*'* | *'?'* | *'['*)
+ echo "$0: invalid mode: $mode" >&2
+ exit 1;;
+ esac
+ shift;;
+
+ -o) chowncmd="$chownprog $2"
+ shift;;
+
+ -s) stripcmd=$stripprog;;
+
+ -t)
+ is_target_a_directory=always
+ dst_arg=$2
+ # Protect names problematic for 'test' and other utilities.
+ case $dst_arg in
+ -* | [=\(\)!]) dst_arg=./$dst_arg;;
+ esac
+ shift;;
+
+ -T) is_target_a_directory=never;;
+
+ --version) echo "$0 $scriptversion"; exit $?;;
+
+ --) shift
+ break;;
+
+ -*) echo "$0: invalid option: $1" >&2
+ exit 1;;
+
+ *) break;;
+ esac
+ shift
+done
+
+# We allow the use of options -d and -T together, by making -d
+# take the precedence; this is for compatibility with GNU install.
+
+if test -n "$dir_arg"; then
+ if test -n "$dst_arg"; then
+ echo "$0: target directory not allowed when installing a directory." >&2
+ exit 1
+ fi
+fi
+
+if test $# -ne 0 && test -z "$dir_arg$dst_arg"; then
+ # When -d is used, all remaining arguments are directories to create.
+ # When -t is used, the destination is already specified.
+ # Otherwise, the last argument is the destination. Remove it from $@.
+ for arg
+ do
+ if test -n "$dst_arg"; then
+ # $@ is not empty: it contains at least $arg.
+ set fnord "$@" "$dst_arg"
+ shift # fnord
+ fi
+ shift # arg
+ dst_arg=$arg
+ # Protect names problematic for 'test' and other utilities.
+ case $dst_arg in
+ -* | [=\(\)!]) dst_arg=./$dst_arg;;
+ esac
+ done
+fi
+
+if test $# -eq 0; then
+ if test -z "$dir_arg"; then
+ echo "$0: no input file specified." >&2
+ exit 1
+ fi
+ # It's OK to call 'install-sh -d' without argument.
+ # This can happen when creating conditional directories.
+ exit 0
+fi
+
+if test -z "$dir_arg"; then
+ if test $# -gt 1 || test "$is_target_a_directory" = always; then
+ if test ! -d "$dst_arg"; then
+ echo "$0: $dst_arg: Is not a directory." >&2
+ exit 1
+ fi
+ fi
+fi
+
+if test -z "$dir_arg"; then
+ do_exit='(exit $ret); exit $ret'
+ trap "ret=129; $do_exit" 1
+ trap "ret=130; $do_exit" 2
+ trap "ret=141; $do_exit" 13
+ trap "ret=143; $do_exit" 15
+
+ # Set umask so as not to create temps with too-generous modes.
+ # However, 'strip' requires both read and write access to temps.
+ case $mode in
+ # Optimize common cases.
+ *644) cp_umask=133;;
+ *755) cp_umask=22;;
+
+ *[0-7])
+ if test -z "$stripcmd"; then
+ u_plus_rw=
+ else
+ u_plus_rw='% 200'
+ fi
+ cp_umask=`expr '(' 777 - $mode % 1000 ')' $u_plus_rw`;;
+ *)
+ if test -z "$stripcmd"; then
+ u_plus_rw=
+ else
+ u_plus_rw=,u+rw
+ fi
+ cp_umask=$mode$u_plus_rw;;
+ esac
+fi
+
+for src
+do
+ # Protect names problematic for 'test' and other utilities.
+ case $src in
+ -* | [=\(\)!]) src=./$src;;
+ esac
+
+ if test -n "$dir_arg"; then
+ dst=$src
+ dstdir=$dst
+ test -d "$dstdir"
+ dstdir_status=$?
+ else
+
+ # Waiting for this to be detected by the "$cpprog $src $dsttmp" command
+ # might cause directories to be created, which would be especially bad
+ # if $src (and thus $dsttmp) contains '*'.
+ if test ! -f "$src" && test ! -d "$src"; then
+ echo "$0: $src does not exist." >&2
+ exit 1
+ fi
+
+ if test -z "$dst_arg"; then
+ echo "$0: no destination specified." >&2
+ exit 1
+ fi
+ dst=$dst_arg
+
+ # If destination is a directory, append the input filename; won't work
+ # if double slashes aren't ignored.
+ if test -d "$dst"; then
+ if test "$is_target_a_directory" = never; then
+ echo "$0: $dst_arg: Is a directory" >&2
+ exit 1
+ fi
+ dstdir=$dst
+ dst=$dstdir/`basename "$src"`
+ dstdir_status=0
+ else
+ dstdir=`dirname "$dst"`
+ test -d "$dstdir"
+ dstdir_status=$?
+ fi
+ fi
+
+ obsolete_mkdir_used=false
+
+ if test $dstdir_status != 0; then
+ case $posix_mkdir in
+ '')
+ # Create intermediate dirs using mode 755 as modified by the umask.
+ # This is like FreeBSD 'install' as of 1997-10-28.
+ umask=`umask`
+ case $stripcmd.$umask in
+ # Optimize common cases.
+ *[2367][2367]) mkdir_umask=$umask;;
+ .*0[02][02] | .[02][02] | .[02]) mkdir_umask=22;;
+
+ *[0-7])
+ mkdir_umask=`expr $umask + 22 \
+ - $umask % 100 % 40 + $umask % 20 \
+ - $umask % 10 % 4 + $umask % 2
+ `;;
+ *) mkdir_umask=$umask,go-w;;
+ esac
+
+ # With -d, create the new directory with the user-specified mode.
+ # Otherwise, rely on $mkdir_umask.
+ if test -n "$dir_arg"; then
+ mkdir_mode=-m$mode
+ else
+ mkdir_mode=
+ fi
+
+ posix_mkdir=false
+ case $umask in
+ *[123567][0-7][0-7])
+ # POSIX mkdir -p sets u+wx bits regardless of umask, which
+ # is incompatible with FreeBSD 'install' when (umask & 300) != 0.
+ ;;
+ *)
+ tmpdir=${TMPDIR-/tmp}/ins$RANDOM-$$
+ trap 'ret=$?; rmdir "$tmpdir/d" "$tmpdir" 2>/dev/null; exit $ret' 0
+
+ if (umask $mkdir_umask &&
+ exec $mkdirprog $mkdir_mode -p -- "$tmpdir/d") >/dev/null 2>&1
+ then
+ if test -z "$dir_arg" || {
+ # Check for POSIX incompatibilities with -m.
+ # HP-UX 11.23 and IRIX 6.5 mkdir -m -p sets group- or
+ # other-writable bit of parent directory when it shouldn't.
+ # FreeBSD 6.1 mkdir -m -p sets mode of existing directory.
+ ls_ld_tmpdir=`ls -ld "$tmpdir"`
+ case $ls_ld_tmpdir in
+ d????-?r-*) different_mode=700;;
+ d????-?--*) different_mode=755;;
+ *) false;;
+ esac &&
+ $mkdirprog -m$different_mode -p -- "$tmpdir" && {
+ ls_ld_tmpdir_1=`ls -ld "$tmpdir"`
+ test "$ls_ld_tmpdir" = "$ls_ld_tmpdir_1"
+ }
+ }
+ then posix_mkdir=:
+ fi
+ rmdir "$tmpdir/d" "$tmpdir"
+ else
+ # Remove any dirs left behind by ancient mkdir implementations.
+ rmdir ./$mkdir_mode ./-p ./-- 2>/dev/null
+ fi
+ trap '' 0;;
+ esac;;
+ esac
+
+ if
+ $posix_mkdir && (
+ umask $mkdir_umask &&
+ $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir"
+ )
+ then :
+ else
+
+ # The umask is ridiculous, or mkdir does not conform to POSIX,
+ # or it failed possibly due to a race condition. Create the
+ # directory the slow way, step by step, checking for races as we go.
+
+ case $dstdir in
+ /*) prefix='/';;
+ [-=\(\)!]*) prefix='./';;
+ *) prefix='';;
+ esac
+
+ oIFS=$IFS
+ IFS=/
+ set -f
+ set fnord $dstdir
+ shift
+ set +f
+ IFS=$oIFS
+
+ prefixes=
+
+ for d
+ do
+ test X"$d" = X && continue
+
+ prefix=$prefix$d
+ if test -d "$prefix"; then
+ prefixes=
+ else
+ if $posix_mkdir; then
+ (umask=$mkdir_umask &&
+ $doit_exec $mkdirprog $mkdir_mode -p -- "$dstdir") && break
+ # Don't fail if two instances are running concurrently.
+ test -d "$prefix" || exit 1
+ else
+ case $prefix in
+ *\'*) qprefix=`echo "$prefix" | sed "s/'/'\\\\\\\\''/g"`;;
+ *) qprefix=$prefix;;
+ esac
+ prefixes="$prefixes '$qprefix'"
+ fi
+ fi
+ prefix=$prefix/
+ done
+
+ if test -n "$prefixes"; then
+ # Don't fail if two instances are running concurrently.
+ (umask $mkdir_umask &&
+ eval "\$doit_exec \$mkdirprog $prefixes") ||
+ test -d "$dstdir" || exit 1
+ obsolete_mkdir_used=true
+ fi
+ fi
+ fi
+
+ if test -n "$dir_arg"; then
+ { test -z "$chowncmd" || $doit $chowncmd "$dst"; } &&
+ { test -z "$chgrpcmd" || $doit $chgrpcmd "$dst"; } &&
+ { test "$obsolete_mkdir_used$chowncmd$chgrpcmd" = false ||
+ test -z "$chmodcmd" || $doit $chmodcmd $mode "$dst"; } || exit 1
+ else
+
+ # Make a couple of temp file names in the proper directory.
+ dsttmp=$dstdir/_inst.$$_
+ rmtmp=$dstdir/_rm.$$_
+
+ # Trap to clean up those temp files at exit.
+ trap 'ret=$?; rm -f "$dsttmp" "$rmtmp" && exit $ret' 0
+
+ # Copy the file name to the temp name.
+ (umask $cp_umask && $doit_exec $cpprog "$src" "$dsttmp") &&
+
+ # and set any options; do chmod last to preserve setuid bits.
+ #
+ # If any of these fail, we abort the whole thing. If we want to
+ # ignore errors from any of these, just make sure not to ignore
+ # errors from the above "$doit $cpprog $src $dsttmp" command.
+ #
+ { test -z "$chowncmd" || $doit $chowncmd "$dsttmp"; } &&
+ { test -z "$chgrpcmd" || $doit $chgrpcmd "$dsttmp"; } &&
+ { test -z "$stripcmd" || $doit $stripcmd "$dsttmp"; } &&
+ { test -z "$chmodcmd" || $doit $chmodcmd $mode "$dsttmp"; } &&
+
+ # If -C, don't bother to copy if it wouldn't change the file.
+ if $copy_on_change &&
+ old=`LC_ALL=C ls -dlL "$dst" 2>/dev/null` &&
+ new=`LC_ALL=C ls -dlL "$dsttmp" 2>/dev/null` &&
+ set -f &&
+ set X $old && old=:$2:$4:$5:$6 &&
+ set X $new && new=:$2:$4:$5:$6 &&
+ set +f &&
+ test "$old" = "$new" &&
+ $cmpprog "$dst" "$dsttmp" >/dev/null 2>&1
+ then
+ rm -f "$dsttmp"
+ else
+ # Rename the file to the real destination.
+ $doit $mvcmd -f "$dsttmp" "$dst" 2>/dev/null ||
+
+ # The rename failed, perhaps because mv can't rename something else
+ # to itself, or perhaps because mv is so ancient that it does not
+ # support -f.
+ {
+ # Now remove or move aside any old file at destination location.
+ # We try this two ways since rm can't unlink itself on some
+ # systems and the destination file might be busy for other
+ # reasons. In this case, the final cleanup might fail but the new
+ # file should still install successfully.
+ {
+ test ! -f "$dst" ||
+ $doit $rmcmd -f "$dst" 2>/dev/null ||
+ { $doit $mvcmd -f "$dst" "$rmtmp" 2>/dev/null &&
+ { $doit $rmcmd -f "$rmtmp" 2>/dev/null; :; }
+ } ||
+ { echo "$0: cannot unlink or rename $dst" >&2
+ (exit 1); exit 1
+ }
+ } &&
+
+ # Now rename the file to the real destination.
+ $doit $mvcmd "$dsttmp" "$dst"
+ }
+ fi || exit 1
+
+ trap '' 0
+ fi
+done
+
+# Local variables:
+# eval: (add-hook 'write-file-hooks 'time-stamp)
+# time-stamp-start: "scriptversion="
+# time-stamp-format: "%:y-%02m-%02d.%02H"
+# time-stamp-time-zone: "UTC0"
+# time-stamp-end: "; # UTC"
+# End:
diff --git a/tools/gzinject/patches/NACE.gzi b/tools/gzinject/patches/NACE.gzi
new file mode 100644
index 000000000..30ab26f55
--- /dev/null
+++ b/tools/gzinject/patches/NACE.gzi
@@ -0,0 +1,6 @@
+# default gz patches for NACE
+0000 00000000 00000001
+# use 8MB memory
+0304 00002EB0 60000000
+# allocate 32MB for rom
+0304 0005BFD4 3C807200
diff --git a/tools/gzinject/patches/NACJ.gzi b/tools/gzinject/patches/NACJ.gzi
new file mode 100644
index 000000000..28415cb8f
--- /dev/null
+++ b/tools/gzinject/patches/NACJ.gzi
@@ -0,0 +1,6 @@
+# default gz patches for NACJ
+0000 00000000 00000001
+# use 8MB memory
+0304 00002EB0 60000000
+# allocate 32MB for rom
+0304 0005BF44 3C807200
diff --git a/tools/gzinject/patches/NKZE.gzi b/tools/gzinject/patches/NKZE.gzi
new file mode 100644
index 000000000..440cbee08
--- /dev/null
+++ b/tools/gzinject/patches/NKZE.gzi
@@ -0,0 +1,16 @@
+# NKZE kz-NZSE
+0000 00000000 00000001
+# decompress content1
+0100 00000000 00000000
+# apply 12MB fixes
+0304 00010B58 3C8000C0
+0304 0004BD20 67E47000
+0304 0004BC80 3CA00100
+# apply controller remappings
+0302 00148514 00000800
+0302 00148518 00000400
+0302 0014851C 00000200
+0302 00148520 00000100
+0302 00148528 00000020
+# compress content1
+0200 00000000 00000000
\ No newline at end of file
diff --git a/tools/gzinject/patches/NKZJ.gzi b/tools/gzinject/patches/NKZJ.gzi
new file mode 100644
index 000000000..d2ee665dd
--- /dev/null
+++ b/tools/gzinject/patches/NKZJ.gzi
@@ -0,0 +1,16 @@
+# NKZJ kz-NZSJ
+0000 00000000 00000001
+# decompress content1
+0100 00000000 00000000
+# apply 12MB fixes
+0304 00010B58 3C8000C0
+0304 0004BD94 67E47000
+0304 0004BCF4 3CA00100
+# apply controller remappings
+0302 0014AA54 00000800
+0302 0014AA58 00000400
+0302 0014AA5C 00000200
+0302 0014AA60 00000100
+0302 0014AA68 00000020
+# compress content1
+0200 00000000 00000000
\ No newline at end of file
diff --git a/tools/gzinject/patches/gz_default_remap.gzi b/tools/gzinject/patches/gz_default_remap.gzi
new file mode 100644
index 000000000..22b857ac4
--- /dev/null
+++ b/tools/gzinject/patches/gz_default_remap.gzi
@@ -0,0 +1,9 @@
+# gz standard remapping for NACE and NACJ
+0000 00000000 00000001
+# apply d-pad remappings
+0302 0016BAF0 00000800
+0302 0016BAF4 00000400
+0302 0016BAF8 00000200
+0302 0016BAFC 00000100
+# apply c-stick remapping
+0302 0016BB04 00000020
diff --git a/tools/gzinject/patches/gz_raphnet_remap.gzi b/tools/gzinject/patches/gz_raphnet_remap.gzi
new file mode 100644
index 000000000..61e1a968c
--- /dev/null
+++ b/tools/gzinject/patches/gz_raphnet_remap.gzi
@@ -0,0 +1,9 @@
+# gz raphnet remapping for NACE and NACJ
+0000 00000000 00000001
+# apply d-pad remappings
+0302 0016BAF0 00000800
+0302 0016BAF4 00000400
+0302 0016BAF8 00000200
+0302 0016BAFC 00000100
+# apply z-trigger remapping
+0302 0016BAD8 00000020
diff --git a/tools/gzinject/patches/hb_NACE.gzi b/tools/gzinject/patches/hb_NACE.gzi
new file mode 100644
index 000000000..dccad2e18
--- /dev/null
+++ b/tools/gzinject/patches/hb_NACE.gzi
@@ -0,0 +1,15 @@
+# homeboy patches for NACE
+0000 00000000 00000001
+# resize MEM2 heap for homeboy
+0302 00085732 00009010
+0304 00085738 60000000
+0304 00085744 60000000
+# homeboy hook
+0304 00002EA8 3c809000
+0304 00002EAC 38840800
+0304 00002EB0 7c8903a6
+0304 00002EB4 80630018
+0304 00002EB8 4e800421
+# Change iOS to 61
+0000 00000000 00000064
+0301 0000018B 0000003D
\ No newline at end of file
diff --git a/tools/gzinject/patches/hb_NACJ.gzi b/tools/gzinject/patches/hb_NACJ.gzi
new file mode 100644
index 000000000..0e852695d
--- /dev/null
+++ b/tools/gzinject/patches/hb_NACJ.gzi
@@ -0,0 +1,15 @@
+# homeboy patches for NACJ
+0000 00000000 00000001
+# resize MEM2 heap for homeboy
+0302 00085726 00009010
+0304 0008572C 60000000
+0304 00085738 60000000
+# homeboy hook
+0304 00002EA8 3c809000
+0304 00002EAC 38840800
+0304 00002EB0 7c8903a6
+0304 00002EB4 80630018
+0304 00002EB8 4e800421
+# Change iOS to 61
+0000 00000000 00000064
+0301 0000018B 0000003D
\ No newline at end of file
diff --git a/tools/gzinject/patches/ootr_dpad_remap.gzi b/tools/gzinject/patches/ootr_dpad_remap.gzi
new file mode 100644
index 000000000..57fcb9256
--- /dev/null
+++ b/tools/gzinject/patches/ootr_dpad_remap.gzi
@@ -0,0 +1,6 @@
+# ootr remapping for NACE and NACJ
+0000 00000000 00000001
+# apply d-pad remappings
+0302 0016BAF4 00000400
+0302 0016BAF8 00000200
+0302 0016BAFC 00000100
diff --git a/tools/gzinject/src/aes.c b/tools/gzinject/src/aes.c
new file mode 100644
index 000000000..f7701f0a8
--- /dev/null
+++ b/tools/gzinject/src/aes.c
@@ -0,0 +1,567 @@
+/*
+
+This is an implementation of the AES algorithm, specifically ECB, CTR and CBC mode.
+Block size can be chosen in aes.h - available choices are AES128, AES192, AES256.
+
+The implementation is verified against the test vectors in:
+National Institute of Standards and Technology Special Publication 800-38A 2001 ED
+
+ECB-AES128
+----------
+
+plain-text:
+6bc1bee22e409f96e93d7e117393172a
+ae2d8a571e03ac9c9eb76fac45af8e51
+30c81c46a35ce411e5fbc1191a0a52ef
+f69f2445df4f9b17ad2b417be66c3710
+
+key:
+2b7e151628aed2a6abf7158809cf4f3c
+
+resulting cipher
+3ad77bb40d7a3660a89ecaf32466ef97
+f5d3d58503b9699de785895a96fdbaaf
+43b1cd7f598ece23881b00e3ed030688
+7b0c785e27e8ad3f8223207104725dd4
+
+
+NOTE: String length must be evenly divisible by 16byte (str_len % 16 == 0)
+You should pad the end of the string with zeros if this is not the case.
+For AES192/256 the key size is proportionally larger.
+
+*/
+
+
+/*****************************************************************************/
+/* Includes: */
+/*****************************************************************************/
+#include <stdint.h>
+#include <string.h> // CBC mode, for memset
+#include "aes.h"
+
+/*****************************************************************************/
+/* Defines: */
+/*****************************************************************************/
+// The number of columns comprising a state in AES. This is a constant in AES. Value=4
+#define Nb 4
+
+#if defined(AES256) && (AES256 == 1)
+#define Nk 8
+#define Nr 14
+#elif defined(AES192) && (AES192 == 1)
+#define Nk 6
+#define Nr 12
+#else
+#define Nk 4 // The number of 32 bit words in a key.
+#define Nr 10 // The number of rounds in AES Cipher.
+#endif
+
+// jcallan@github points out that declaring Multiply as a function
+// reduces code size considerably with the Keil ARM compiler.
+// See this link for more information: https://github.com/kokke/tiny-AES-C/pull/3
+#ifndef MULTIPLY_AS_A_FUNCTION
+#define MULTIPLY_AS_A_FUNCTION 0
+#endif
+
+
+
+
+/*****************************************************************************/
+/* Private variables: */
+/*****************************************************************************/
+// state - array holding the intermediate results during decryption.
+typedef uint8_t state_t[4][4];
+
+
+
+// The lookup-tables are marked const so they can be placed in read-only storage instead of RAM
+// The numbers below can be computed dynamically trading ROM for RAM -
+// This can be useful in (embedded) bootloader applications, where ROM is often limited.
+static const uint8_t sbox[256] = {
+ //0 1 2 3 4 5 6 7 8 9 A B C D E F
+ 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
+ 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
+ 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
+ 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
+ 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
+ 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
+ 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
+ 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
+ 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
+ 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
+ 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
+ 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
+ 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
+ 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
+ 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
+ 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 };
+
+static const uint8_t rsbox[256] = {
+ 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
+ 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
+ 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
+ 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
+ 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
+ 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
+ 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
+ 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
+ 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
+ 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
+ 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
+ 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
+ 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
+ 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
+ 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
+ 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d };
+
+// The round constant word array, Rcon[i], contains the values given by
+// x to the power (i-1) being powers of x (x is denoted as {02}) in the field GF(2^8)
+static const uint8_t Rcon[11] = {
+ 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36 };
+
+/*
+* Jordan Goulder points out in PR #12 (https://github.com/kokke/tiny-AES-C/pull/12),
+* that you can remove most of the elements in the Rcon array, because they are unused.
+*
+* From Wikipedia's article on the Rijndael key schedule @ https://en.wikipedia.org/wiki/Rijndael_key_schedule#Rcon
+*
+* "Only the first some of these constants are actually used – up to rcon[10] for AES-128 (as 11 round keys are needed),
+* up to rcon[8] for AES-192, up to rcon[7] for AES-256. rcon[0] is not used in AES algorithm."
+*/
+
+
+/*****************************************************************************/
+/* Private functions: */
+/*****************************************************************************/
+/*
+static uint8_t getSBoxValue(uint8_t num)
+{
+return sbox[num];
+}
+*/
+#define getSBoxValue(num) (sbox[(num)])
+/*
+static uint8_t getSBoxInvert(uint8_t num)
+{
+return rsbox[num];
+}
+*/
+#define getSBoxInvert(num) (rsbox[(num)])
+
+// This function produces Nb(Nr+1) round keys. The round keys are used in each round to decrypt the states.
+static void KeyExpansion(uint8_t* RoundKey, const uint8_t* Key)
+{
+ unsigned i, j, k;
+ uint8_t tempa[4]; // Used for the column/row operations
+
+ // The first round key is the key itself.
+ for (i = 0; i < Nk; ++i)
+ {
+ RoundKey[(i * 4) + 0] = Key[(i * 4) + 0];
+ RoundKey[(i * 4) + 1] = Key[(i * 4) + 1];
+ RoundKey[(i * 4) + 2] = Key[(i * 4) + 2];
+ RoundKey[(i * 4) + 3] = Key[(i * 4) + 3];
+ }
+
+ // All other round keys are found from the previous round keys.
+ for (i = Nk; i < Nb * (Nr + 1); ++i)
+ {
+ {
+ k = (i - 1) * 4;
+ tempa[0] = RoundKey[k + 0];
+ tempa[1] = RoundKey[k + 1];
+ tempa[2] = RoundKey[k + 2];
+ tempa[3] = RoundKey[k + 3];
+
+ }
+
+ if (i % Nk == 0)
+ {
+ // This function shifts the 4 bytes in a word to the left once.
+ // [a0,a1,a2,a3] becomes [a1,a2,a3,a0]
+
+ // Function RotWord()
+ {
+ k = tempa[0];
+ tempa[0] = tempa[1];
+ tempa[1] = tempa[2];
+ tempa[2] = tempa[3];
+ tempa[3] = k;
+ }
+
+ // SubWord() is a function that takes a four-byte input word and
+ // applies the S-box to each of the four bytes to produce an output word.
+
+ // Function Subword()
+ {
+ tempa[0] = getSBoxValue(tempa[0]);
+ tempa[1] = getSBoxValue(tempa[1]);
+ tempa[2] = getSBoxValue(tempa[2]);
+ tempa[3] = getSBoxValue(tempa[3]);
+ }
+
+ tempa[0] = tempa[0] ^ Rcon[i / Nk];
+ }
+#if defined(AES256) && (AES256 == 1)
+ if (i % Nk == 4)
+ {
+ // Function Subword()
+ {
+ tempa[0] = getSBoxValue(tempa[0]);
+ tempa[1] = getSBoxValue(tempa[1]);
+ tempa[2] = getSBoxValue(tempa[2]);
+ tempa[3] = getSBoxValue(tempa[3]);
+ }
+ }
+#endif
+ j = i * 4; k = (i - Nk) * 4;
+ RoundKey[j + 0] = RoundKey[k + 0] ^ tempa[0];
+ RoundKey[j + 1] = RoundKey[k + 1] ^ tempa[1];
+ RoundKey[j + 2] = RoundKey[k + 2] ^ tempa[2];
+ RoundKey[j + 3] = RoundKey[k + 3] ^ tempa[3];
+ }
+}
+
+void AES_init_ctx(struct AES_ctx* ctx, const uint8_t* key)
+{
+ KeyExpansion(ctx->RoundKey, key);
+}
+#if defined(CBC) && (CBC == 1)
+void AES_init_ctx_iv(struct AES_ctx* ctx, const uint8_t* key, const uint8_t* iv)
+{
+ KeyExpansion(ctx->RoundKey, key);
+ memcpy(ctx->Iv, iv, AES_BLOCKLEN);
+}
+void AES_ctx_set_iv(struct AES_ctx* ctx, const uint8_t* iv)
+{
+ memcpy(ctx->Iv, iv, AES_BLOCKLEN);
+}
+#endif
+
+// This function adds the round key to state.
+// The round key is added to the state by an XOR function.
+static void AddRoundKey(uint8_t round, state_t* state, uint8_t* RoundKey)
+{
+ uint8_t i, j;
+ for (i = 0; i < 4; ++i)
+ {
+ for (j = 0; j < 4; ++j)
+ {
+ (*state)[i][j] ^= RoundKey[(round * Nb * 4) + (i * Nb) + j];
+ }
+ }
+}
+
+// The SubBytes Function Substitutes the values in the
+// state matrix with values in an S-box.
+static void SubBytes(state_t* state)
+{
+ uint8_t i, j;
+ for (i = 0; i < 4; ++i)
+ {
+ for (j = 0; j < 4; ++j)
+ {
+ (*state)[j][i] = getSBoxValue((*state)[j][i]);
+ }
+ }
+}
+
+// The ShiftRows() function shifts the rows in the state to the left.
+// Each row is shifted with different offset.
+// Offset = Row number. So the first row is not shifted.
+static void ShiftRows(state_t* state)
+{
+ uint8_t temp;
+
+ // Rotate first row 1 columns to left
+ temp = (*state)[0][1];
+ (*state)[0][1] = (*state)[1][1];
+ (*state)[1][1] = (*state)[2][1];
+ (*state)[2][1] = (*state)[3][1];
+ (*state)[3][1] = temp;
+
+ // Rotate second row 2 columns to left
+ temp = (*state)[0][2];
+ (*state)[0][2] = (*state)[2][2];
+ (*state)[2][2] = temp;
+
+ temp = (*state)[1][2];
+ (*state)[1][2] = (*state)[3][2];
+ (*state)[3][2] = temp;
+
+ // Rotate third row 3 columns to left
+ temp = (*state)[0][3];
+ (*state)[0][3] = (*state)[3][3];
+ (*state)[3][3] = (*state)[2][3];
+ (*state)[2][3] = (*state)[1][3];
+ (*state)[1][3] = temp;
+}
+
+static uint8_t xtime(uint8_t x)
+{
+ return ((x << 1) ^ (((x >> 7) & 1) * 0x1b));
+}
+
+// MixColumns function mixes the columns of the state matrix
+static void MixColumns(state_t* state)
+{
+ uint8_t i;
+ uint8_t Tmp, Tm, t;
+ for (i = 0; i < 4; ++i)
+ {
+ t = (*state)[i][0];
+ Tmp = (*state)[i][0] ^ (*state)[i][1] ^ (*state)[i][2] ^ (*state)[i][3];
+ Tm = (*state)[i][0] ^ (*state)[i][1]; Tm = xtime(Tm); (*state)[i][0] ^= Tm ^ Tmp;
+ Tm = (*state)[i][1] ^ (*state)[i][2]; Tm = xtime(Tm); (*state)[i][1] ^= Tm ^ Tmp;
+ Tm = (*state)[i][2] ^ (*state)[i][3]; Tm = xtime(Tm); (*state)[i][2] ^= Tm ^ Tmp;
+ Tm = (*state)[i][3] ^ t; Tm = xtime(Tm); (*state)[i][3] ^= Tm ^ Tmp;
+ }
+}
+
+// Multiply is used to multiply numbers in the field GF(2^8)
+#if MULTIPLY_AS_A_FUNCTION
+static uint8_t Multiply(uint8_t x, uint8_t y)
+{
+ return (((y & 1) * x) ^
+ ((y >> 1 & 1) * xtime(x)) ^
+ ((y >> 2 & 1) * xtime(xtime(x))) ^
+ ((y >> 3 & 1) * xtime(xtime(xtime(x)))) ^
+ ((y >> 4 & 1) * xtime(xtime(xtime(xtime(x))))));
+}
+#else
+#define Multiply(x, y) \
+ ( ((y & 1) * x) ^ \
+ ((y>>1 & 1) * xtime(x)) ^ \
+ ((y>>2 & 1) * xtime(xtime(x))) ^ \
+ ((y>>3 & 1) * xtime(xtime(xtime(x)))) ^ \
+ ((y>>4 & 1) * xtime(xtime(xtime(xtime(x)))))) \
+
+#endif
+
+// MixColumns function mixes the columns of the state matrix.
+// The method used to multiply may be difficult to understand for the inexperienced.
+// Please use the references to gain more information.
+static void InvMixColumns(state_t* state)
+{
+ int i;
+ uint8_t a, b, c, d;
+ for (i = 0; i < 4; ++i)
+ {
+ a = (*state)[i][0];
+ b = (*state)[i][1];
+ c = (*state)[i][2];
+ d = (*state)[i][3];
+
+ (*state)[i][0] = Multiply(a, 0x0e) ^ Multiply(b, 0x0b) ^ Multiply(c, 0x0d) ^ Multiply(d, 0x09);
+ (*state)[i][1] = Multiply(a, 0x09) ^ Multiply(b, 0x0e) ^ Multiply(c, 0x0b) ^ Multiply(d, 0x0d);
+ (*state)[i][2] = Multiply(a, 0x0d) ^ Multiply(b, 0x09) ^ Multiply(c, 0x0e) ^ Multiply(d, 0x0b);
+ (*state)[i][3] = Multiply(a, 0x0b) ^ Multiply(b, 0x0d) ^ Multiply(c, 0x09) ^ Multiply(d, 0x0e);
+ }
+}
+
+
+// The SubBytes Function Substitutes the values in the
+// state matrix with values in an S-box.
+static void InvSubBytes(state_t* state)
+{
+ uint8_t i, j;
+ for (i = 0; i < 4; ++i)
+ {
+ for (j = 0; j < 4; ++j)
+ {
+ (*state)[j][i] = getSBoxInvert((*state)[j][i]);
+ }
+ }
+}
+
+static void InvShiftRows(state_t* state)
+{
+ uint8_t temp;
+
+ // Rotate first row 1 columns to right
+ temp = (*state)[3][1];
+ (*state)[3][1] = (*state)[2][1];
+ (*state)[2][1] = (*state)[1][1];
+ (*state)[1][1] = (*state)[0][1];
+ (*state)[0][1] = temp;
+
+ // Rotate second row 2 columns to right
+ temp = (*state)[0][2];
+ (*state)[0][2] = (*state)[2][2];
+ (*state)[2][2] = temp;
+
+ temp = (*state)[1][2];
+ (*state)[1][2] = (*state)[3][2];
+ (*state)[3][2] = temp;
+
+ // Rotate third row 3 columns to right
+ temp = (*state)[0][3];
+ (*state)[0][3] = (*state)[1][3];
+ (*state)[1][3] = (*state)[2][3];
+ (*state)[2][3] = (*state)[3][3];
+ (*state)[3][3] = temp;
+}
+
+
+// Cipher is the main function that encrypts the PlainText.
+static void Cipher(state_t* state, uint8_t* RoundKey)
+{
+ uint8_t round = 0;
+
+ // Add the First round key to the state before starting the rounds.
+ AddRoundKey(0, state, RoundKey);
+
+ // There will be Nr rounds.
+ // The first Nr-1 rounds are identical.
+ // These Nr-1 rounds are executed in the loop below.
+ for (round = 1; round < Nr; ++round)
+ {
+ SubBytes(state);
+ ShiftRows(state);
+ MixColumns(state);
+ AddRoundKey(round, state, RoundKey);
+ }
+
+ // The last round is given below.
+ // The MixColumns function is not here in the last round.
+ SubBytes(state);
+ ShiftRows(state);
+ AddRoundKey(Nr, state, RoundKey);
+}
+
+static void InvCipher(state_t* state, uint8_t* RoundKey)
+{
+ uint8_t round = 0;
+
+ // Add the First round key to the state before starting the rounds.
+ AddRoundKey(Nr, state, RoundKey);
+
+ // There will be Nr rounds.
+ // The first Nr-1 rounds are identical.
+ // These Nr-1 rounds are executed in the loop below.
+ for (round = (Nr - 1); round > 0; --round)
+ {
+ InvShiftRows(state);
+ InvSubBytes(state);
+ AddRoundKey(round, state, RoundKey);
+ InvMixColumns(state);
+ }
+
+ // The last round is given below.
+ // The MixColumns function is not here in the last round.
+ InvShiftRows(state);
+ InvSubBytes(state);
+ AddRoundKey(0, state, RoundKey);
+}
+
+
+/*****************************************************************************/
+/* Public functions: */
+/*****************************************************************************/
+#if defined(ECB) && (ECB == 1)
+
+
+void AES_ECB_encrypt(struct AES_ctx *ctx, const uint8_t* buf)
+{
+ // The next function call encrypts the PlainText with the Key using AES algorithm.
+ Cipher((state_t*)buf, ctx->RoundKey);
+}
+
+void AES_ECB_decrypt(struct AES_ctx* ctx, const uint8_t* buf)
+{
+ // The next function call decrypts the PlainText with the Key using AES algorithm.
+ InvCipher((state_t*)buf, ctx->RoundKey);
+}
+
+
+#endif // #if defined(ECB) && (ECB == 1)
+
+
+
+
+
+#if defined(CBC) && (CBC == 1)
+
+
+static void XorWithIv(uint8_t* buf, uint8_t* Iv)
+{
+ uint8_t i;
+ for (i = 0; i < AES_BLOCKLEN; ++i) // The block in AES is always 128bit no matter the key size
+ {
+ buf[i] ^= Iv[i];
+ }
+}
+
+void AES_CBC_encrypt_buffer(struct AES_ctx *ctx, uint8_t* buf, uint32_t length)
+{
+ uintptr_t i;
+ uint8_t *Iv = ctx->Iv;
+ for (i = 0; i < length; i += AES_BLOCKLEN)
+ {
+ XorWithIv(buf, Iv);
+ Cipher((state_t*)buf, ctx->RoundKey);
+ Iv = buf;
+ buf += AES_BLOCKLEN;
+ //printf("Step %d - %d", i/16, i);
+ }
+ /* store Iv in ctx for next call */
+ memcpy(ctx->Iv, Iv, AES_BLOCKLEN);
+}
+
+void AES_CBC_decrypt_buffer(struct AES_ctx* ctx, uint8_t* buf, uint32_t length)
+{
+ uintptr_t i;
+ uint8_t storeNextIv[AES_BLOCKLEN];
+ for (i = 0; i < length; i += AES_BLOCKLEN)
+ {
+ memcpy(storeNextIv, buf, AES_BLOCKLEN);
+ InvCipher((state_t*)buf, ctx->RoundKey);
+ XorWithIv(buf, ctx->Iv);
+ memcpy(ctx->Iv, storeNextIv, AES_BLOCKLEN);
+ buf += AES_BLOCKLEN;
+ }
+
+}
+
+#endif // #if defined(CBC) && (CBC == 1)
+
+
+
+#if defined(CTR) && (CTR == 1)
+
+/* Symmetrical operation: same function for encrypting as for decrypting. Note any IV/nonce should never be reused with the same key */
+void AES_CTR_xcrypt_buffer(struct AES_ctx* ctx, uint8_t* buf, uint32_t length)
+{
+ uint8_t buffer[AES_BLOCKLEN];
+
+ unsigned i;
+ int bi;
+ for (i = 0, bi = AES_BLOCKLEN; i < length; ++i, ++bi)
+ {
+ if (bi == AES_BLOCKLEN) /* we need to regen xor compliment in buffer */
+ {
+
+ memcpy(buffer, ctx->Iv, AES_BLOCKLEN);
+ Cipher((state_t*)buffer, ctx->RoundKey);
+
+ /* Increment Iv and handle overflow */
+ for (bi = (AES_BLOCKLEN - 1); bi >= 0; --bi)
+ {
+ /* inc will owerflow */
+ if (ctx->Iv[bi] == 255)
+ {
+ ctx->Iv[bi] = 0;
+ continue;
+ }
+ ctx->Iv[bi] += 1;
+ break;
+ }
+ bi = 0;
+ }
+
+ buf[i] = (buf[i] ^ buffer[bi]);
+ }
+}
+
+#endif // #if defined(CTR) && (CTR == 1)
+
diff --git a/tools/gzinject/src/aes.h b/tools/gzinject/src/aes.h
new file mode 100644
index 000000000..d1a468630
--- /dev/null
+++ b/tools/gzinject/src/aes.h
@@ -0,0 +1,90 @@
+#ifndef _AES_H_
+#define _AES_H_
+
+#include <stdint.h>
+
+// #define the macros below to 1/0 to enable/disable the mode of operation.
+//
+// CBC enables AES encryption in CBC-mode of operation.
+// CTR enables encryption in counter-mode.
+// ECB enables the basic ECB 16-byte block algorithm. All can be enabled simultaneously.
+
+// The #ifndef-guard allows it to be configured before #include'ing or at compile time.
+#ifndef CBC
+#define CBC 1
+#endif
+
+#ifndef ECB
+#define ECB 1
+#endif
+
+#ifndef CTR
+#define CTR 1
+#endif
+
+
+#define AES128 1
+//#define AES192 1
+//#define AES256 1
+
+#define AES_BLOCKLEN 16 //Block length in bytes AES is 128b block only
+
+#if defined(AES256) && (AES256 == 1)
+#define AES_KEYLEN 32
+#define AES_keyExpSize 240
+#elif defined(AES192) && (AES192 == 1)
+#define AES_KEYLEN 24
+#define AES_keyExpSize 208
+#else
+#define AES_KEYLEN 16 // Key length in bytes
+#define AES_keyExpSize 176
+#endif
+
+struct AES_ctx
+{
+ uint8_t RoundKey[AES_keyExpSize];
+#if (defined(CBC) && (CBC == 1)) || (defined(CTR) && (CTR == 1))
+ uint8_t Iv[AES_BLOCKLEN];
+#endif
+};
+
+void AES_init_ctx(struct AES_ctx* ctx, const uint8_t* key);
+#if defined(CBC) && (CBC == 1)
+void AES_init_ctx_iv(struct AES_ctx* ctx, const uint8_t* key, const uint8_t* iv);
+void AES_ctx_set_iv(struct AES_ctx* ctx, const uint8_t* iv);
+#endif
+
+#if defined(ECB) && (ECB == 1)
+// buffer size is exactly AES_BLOCKLEN bytes;
+// you need only AES_init_ctx as IV is not used in ECB
+// NB: ECB is considered insecure for most uses
+void AES_ECB_encrypt(struct AES_ctx* ctx, const uint8_t* buf);
+void AES_ECB_decrypt(struct AES_ctx* ctx, const uint8_t* buf);
+
+#endif // #if defined(ECB) && (ECB == 1)
+
+
+#if defined(CBC) && (CBC == 1)
+// buffer size MUST be multiple of AES_BLOCKLEN;
+// Suggest https://en.wikipedia.org/wiki/Padding_(cryptography)#PKCS7 for padding scheme
+// NOTES: you need to set IV in ctx via AES_init_ctx_iv() or AES_ctx_set_iv()
+// no IV should ever be reused with the same key
+void AES_CBC_encrypt_buffer(struct AES_ctx* ctx, uint8_t* buf, uint32_t length);
+void AES_CBC_decrypt_buffer(struct AES_ctx* ctx, uint8_t* buf, uint32_t length);
+
+#endif // #if defined(CBC) && (CBC == 1)
+
+
+#if defined(CTR) && (CTR == 1)
+
+// Same function for encrypting as for decrypting.
+// IV is incremented for every block, and used after encryption as XOR-compliment for output
+// Suggesting https://en.wikipedia.org/wiki/Padding_(cryptography)#PKCS7 for padding scheme
+// NOTES: you need to set IV in ctx with AES_init_ctx_iv() or AES_ctx_set_iv()
+// no IV should ever be reused with the same key
+void AES_CTR_xcrypt_buffer(struct AES_ctx* ctx, uint8_t* buf, uint32_t length);
+
+#endif // #if defined(CTR) && (CTR == 1)
+
+
+#endif //_AES_H_
\ No newline at end of file
diff --git a/tools/gzinject/src/doltool.c b/tools/gzinject/src/doltool.c
new file mode 100644
index 000000000..dbc48e451
--- /dev/null
+++ b/tools/gzinject/src/doltool.c
@@ -0,0 +1,99 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "doltool.h"
+#include "gzinject.h"
+
+void dol_load(doltool_ctxt_t *ctxt, uint8_t **file_data, uint32_t *file_size){
+
+ ctxt->file_data = file_data;
+ ctxt->file_size = file_size;
+
+ memcpy(&ctxt->hdr,*(ctxt->file_data),sizeof(ctxt->hdr));
+
+ for(int i=0;i<7;i++){
+ ctxt->hdr.text_size[i] = REVERSEENDIAN32(ctxt->hdr.text_size[i]);
+ ctxt->hdr.text_offset[i] = REVERSEENDIAN32(ctxt->hdr.text_offset[i]);
+ ctxt->hdr.text_loading[i] = REVERSEENDIAN32(ctxt->hdr.text_loading[i]);
+ if(ctxt->hdr.text_size[i]>0){
+ ctxt->text_sections[i] = *(ctxt->file_data) + ctxt->hdr.text_offset[i];
+ }
+ }
+ for(int i=0;i<11;i++){
+ ctxt->hdr.data_size[i] = REVERSEENDIAN32(ctxt->hdr.data_size[i]);
+ ctxt->hdr.data_offset[i] = REVERSEENDIAN32(ctxt->hdr.data_offset[i]);
+ ctxt->hdr.data_loading[i] = REVERSEENDIAN32(ctxt->hdr.data_loading[i]);
+ if(ctxt->hdr.data_size[i]>0){
+ ctxt->data_sections[i] = *(ctxt->file_data) + ctxt->hdr.data_offset[i];
+ }
+ }
+}
+
+void dol_inject(doltool_ctxt_t *ctxt, uint8_t *text, size_t size, uint32_t loading_addr){
+ int injection_idx = 0;
+ size = addpadding(size,16);
+ for(int i = 0;i<7;i++){
+ if(ctxt->text_sections[i]) continue;
+ injection_idx = i;
+ ctxt->text_sections[i] = text;
+ ctxt->hdr.text_loading[i] = loading_addr;
+ ctxt->hdr.text_offset[i] = ctxt->hdr.data_offset[0];
+ ctxt->hdr.text_size[i] = size;
+ break;
+ }
+ for(int i=0;i<11;i++){
+ if(ctxt->data_sections[i]){
+ ctxt->hdr.data_offset[i] += ctxt->hdr.text_size[injection_idx];
+ }else{
+ break;
+ }
+ }
+}
+
+size_t dol_save(doltool_ctxt_t *ctxt){
+ uint32_t text_sizes[7];
+ uint32_t data_sizes[11];
+ memcpy(text_sizes,ctxt->hdr.text_size,sizeof(ctxt->hdr.text_size));
+ memcpy(data_sizes,ctxt->hdr.data_size,sizeof(ctxt->hdr.data_size));
+ uint32_t totalsize = 0x100;
+ for(int i=0;i<7;i++){
+ totalsize += ctxt->hdr.text_size[i];
+ ctxt->hdr.text_size[i] = REVERSEENDIAN32(ctxt->hdr.text_size[i]);
+ ctxt->hdr.text_offset[i] = REVERSEENDIAN32(ctxt->hdr.text_offset[i]);
+ ctxt->hdr.text_loading[i] = REVERSEENDIAN32(ctxt->hdr.text_loading[i]);
+ }
+ for(int i=0;i<11;i++){
+ totalsize += ctxt->hdr.data_size[i];
+ ctxt->hdr.data_size[i] = REVERSEENDIAN32(ctxt->hdr.data_size[i]);
+ ctxt->hdr.data_offset[i] = REVERSEENDIAN32(ctxt->hdr.data_offset[i]);
+ ctxt->hdr.data_loading[i] = REVERSEENDIAN32(ctxt->hdr.data_loading[i]);
+ }
+
+ uint8_t *new_data = malloc(totalsize);
+ if(!new_data){
+ perror("Could not allocate new dol");
+ return 0;
+ }
+ memcpy(new_data,&ctxt->hdr,sizeof(ctxt->hdr));
+ uint8_t *p = new_data + sizeof(ctxt->hdr);
+ for(int i=0;i<7;i++){
+ if(ctxt->text_sections[i]){
+ memcpy(p,ctxt->text_sections[i],text_sizes[i]);
+ p += text_sizes[i];
+ }
+ }
+ for(int i=0;i<11;i++){
+ if(ctxt->data_sections[i]){
+ memcpy(p,ctxt->data_sections[i],data_sizes[i]);
+ p += data_sizes[i];
+ }
+ }
+ free(*(ctxt->file_data));
+ *(ctxt->file_data) = new_data;
+ if(ctxt->file_size){
+ *(ctxt->file_size) = totalsize;
+ }
+ return totalsize;
+}
\ No newline at end of file
diff --git a/tools/gzinject/src/doltool.h b/tools/gzinject/src/doltool.h
new file mode 100644
index 000000000..3c30194e9
--- /dev/null
+++ b/tools/gzinject/src/doltool.h
@@ -0,0 +1,31 @@
+#ifndef _DOLTOOL_H
+#define _DOLTOOL_H
+
+#include <stdint.h>
+
+typedef struct {
+ uint32_t text_offset[7]; /* 0x000 */
+ uint32_t data_offset[11]; /* 0x01C */
+ uint32_t text_loading[7]; /* 0x048 */
+ uint32_t data_loading[11]; /* 0x064 */
+ uint32_t text_size[7]; /* 0x090 */
+ uint32_t data_size[11]; /* 0x0AC */
+ uint32_t bss_loading; /* 0x0D8 */
+ uint32_t bss_size; /* 0x0DC */
+ uint32_t entry; /* 0x0E0 */
+ char padding[0x1C]; /* 0x0E4 */
+} dol_hdr_t; /* 0x100 */
+
+typedef struct{
+ dol_hdr_t hdr;
+ uint8_t *text_sections[7];
+ uint8_t *data_sections[11];
+ uint8_t **file_data;
+ uint32_t *file_size;
+} doltool_ctxt_t;
+
+void dol_load(doltool_ctxt_t *ctxt, uint8_t **file_data, uint32_t *file_size);
+void dol_inject(doltool_ctxt_t *ctxt, uint8_t *text, size_t size, uint32_t loading_addr);
+size_t dol_save(doltool_ctxt_t *ctxt);
+
+#endif
\ No newline at end of file
diff --git a/tools/gzinject/src/fastaes.c b/tools/gzinject/src/fastaes.c
new file mode 100644
index 000000000..ed6e9ae4b
--- /dev/null
+++ b/tools/gzinject/src/fastaes.c
@@ -0,0 +1,115 @@
+#ifdef FASTAES
+#include <string.h>
+#include "fastaes.h"
+
+static __m128i do_key_exp(__m128i a, __m128i b) {
+ __m128i tmp;
+
+ b = _mm_shuffle_epi32(b, 0xFF);
+ tmp = _mm_slli_si128(a, 4);
+ a = _mm_xor_si128(a, tmp);
+ tmp = _mm_slli_si128(a, 4);
+ a = _mm_xor_si128(a, tmp);
+ tmp = _mm_slli_si128(a, 4);
+ a = _mm_xor_si128(a, tmp);
+ a = _mm_xor_si128(a, b);
+
+ return a;
+}
+
+static void key_expansion(const uint8_t *key, __m128i *key_sched, __m128i *dkey_sched) {
+ key_sched[0] = _mm_loadu_si128((const __m128i_u*)key);
+ key_sched[1] = do_key_exp(key_sched[0], _mm_aeskeygenassist_si128(key_sched[0], 0x01));
+ key_sched[2] = do_key_exp(key_sched[1], _mm_aeskeygenassist_si128(key_sched[1], 0x02));
+ key_sched[3] = do_key_exp(key_sched[2], _mm_aeskeygenassist_si128(key_sched[2], 0x04));
+ key_sched[4] = do_key_exp(key_sched[3], _mm_aeskeygenassist_si128(key_sched[3], 0x08));
+ key_sched[5] = do_key_exp(key_sched[4], _mm_aeskeygenassist_si128(key_sched[4], 0x10));
+ key_sched[6] = do_key_exp(key_sched[5], _mm_aeskeygenassist_si128(key_sched[5], 0x20));
+ key_sched[7] = do_key_exp(key_sched[6], _mm_aeskeygenassist_si128(key_sched[6], 0x40));
+ key_sched[8] = do_key_exp(key_sched[7], _mm_aeskeygenassist_si128(key_sched[7], 0x80));
+ key_sched[9] = do_key_exp(key_sched[8], _mm_aeskeygenassist_si128(key_sched[8], 0x1B));
+ key_sched[10] = do_key_exp(key_sched[9], _mm_aeskeygenassist_si128(key_sched[9], 0x36));
+
+ dkey_sched[0] = key_sched[0];
+ dkey_sched[1] = _mm_aesimc_si128(key_sched[1]);
+ dkey_sched[2] = _mm_aesimc_si128(key_sched[2]);
+ dkey_sched[3] = _mm_aesimc_si128(key_sched[3]);
+ dkey_sched[4] = _mm_aesimc_si128(key_sched[4]);
+ dkey_sched[5] = _mm_aesimc_si128(key_sched[5]);
+ dkey_sched[6] = _mm_aesimc_si128(key_sched[6]);
+ dkey_sched[7] = _mm_aesimc_si128(key_sched[7]);
+ dkey_sched[8] = _mm_aesimc_si128(key_sched[8]);
+ dkey_sched[9] = _mm_aesimc_si128(key_sched[9]);
+ dkey_sched[10] = key_sched[10];
+
+}
+
+void aes_ctx_init(aes_ctxt_t *ctx, const uint8_t *key, const uint8_t *iv) {
+ memcpy(ctx->iv, iv, sizeof(ctx->iv));
+ key_expansion(key, ctx->key_schedule, ctx->dkey_schedule);
+}
+
+static __m128i cipher(__m128i state, __m128i *key_sched) {
+ state = _mm_xor_si128(state, key_sched[0]);
+
+ for(int i = 1; i < 10; i++) {
+ state = _mm_aesenc_si128(state, key_sched[i]);
+ }
+
+ return _mm_aesenclast_si128(state, key_sched[10]);
+}
+
+static __m128i inv_cipher(__m128i state, __m128i *key_sched) {
+ state = _mm_xor_si128(state, key_sched[10]);
+ state = _mm_aesdec_si128(state, key_sched[9]);
+ state = _mm_aesdec_si128(state, key_sched[8]);
+ state = _mm_aesdec_si128(state, key_sched[7]);
+ state = _mm_aesdec_si128(state, key_sched[6]);
+ state = _mm_aesdec_si128(state, key_sched[5]);
+ state = _mm_aesdec_si128(state, key_sched[4]);
+ state = _mm_aesdec_si128(state, key_sched[3]);
+ state = _mm_aesdec_si128(state, key_sched[2]);
+ state = _mm_aesdec_si128(state, key_sched[1]);
+
+ state = _mm_aesdeclast_si128(state, key_sched[0]);
+
+ return state;
+}
+
+void aes_encrypt_buffer(aes_ctxt_t *ctx, uint8_t *buffer, size_t len) {
+ __m128i iv = _mm_loadu_si128((const __m128i*)ctx->iv);
+ __m128i state;
+
+ for(int i = 0; i < len; i += 16) {
+ state = _mm_loadu_si128((const __m128i*)buffer);
+ state = _mm_xor_si128(state, iv);
+ state = cipher(state, ctx->key_schedule);
+ _mm_storeu_si128((__m128i_u*)buffer, state);
+ iv = state;
+
+ buffer += 16;
+ }
+
+ _mm_storeu_si128((__m128i_u*)&ctx->state, state);
+ _mm_storeu_si128((__m128i_u*)ctx->iv, iv);
+}
+
+void aes_decrypt_buffer(aes_ctxt_t *ctx, uint8_t *buffer, size_t len) {
+ __m128i state;
+ __m128i iv = _mm_loadu_si128((const __m128i_u*)ctx->iv);
+ __m128i next_iv;
+
+ for(int i = 0; i < len; i += 16) {
+ state = _mm_loadu_si128((const __m128i_u*)buffer);
+ next_iv = state;
+ state = inv_cipher(state, ctx->dkey_schedule);
+ state = _mm_xor_si128(state, iv);
+ iv = next_iv;
+ _mm_storeu_si128((__m128i_u*)buffer, state);
+ buffer += 16;
+ }
+
+ _mm_storeu_si128((__m128i_u*)&ctx->state, state);
+ _mm_storeu_si128((__m128i_u*)ctx->iv, iv);
+}
+#endif
diff --git a/tools/gzinject/src/fastaes.h b/tools/gzinject/src/fastaes.h
new file mode 100644
index 000000000..3080751fb
--- /dev/null
+++ b/tools/gzinject/src/fastaes.h
@@ -0,0 +1,27 @@
#ifdef FASTAES
#ifndef _FASTAES_H
#define _FASTAES_H

/* Include targets reconstructed — the angle-bracket names were lost during
 * text extraction: fixed-width ints, size_t, and the AES-NI __m128i type. */
#include <stdint.h>
#include <stddef.h>
#include <wmmintrin.h>

/**
 * fast aes for x86/x86-64 processors.
 */

/* 4x4 byte AES state block. */
typedef uint8_t state_t[4][4];

typedef struct {
    state_t state;               /* state of the last processed block */
    uint8_t iv[16];              /* CBC chaining IV, updated per call */
    // gzinject only cares about aes128
    __m128i key_schedule[11];    /* encryption round keys */
    __m128i dkey_schedule[11];   /* decryption round keys */
} aes_ctxt_t;

/* Derive the round keys from `key` and load `iv` into the context. */
void aes_ctx_init(aes_ctxt_t *ctx, const uint8_t *key, const uint8_t *iv);
/* AES-128-CBC encrypt/decrypt `len` bytes of `buffer` in place. */
void aes_encrypt_buffer(aes_ctxt_t *ctx, uint8_t *buffer, size_t len);
void aes_decrypt_buffer(aes_ctxt_t *ctx, uint8_t *buffer, size_t len);

#endif
#endif
diff --git a/tools/gzinject/src/gzi.c b/tools/gzinject/src/gzi.c
new file mode 100644
index 000000000..95768c217
--- /dev/null
+++ b/tools/gzinject/src/gzi.c
@@ -0,0 +1,218 @@
/* Reconstructed include targets (angle-bracket names were stripped during
 * extraction). Chosen for what gzi.c visibly uses: printf/fprintf, calloc/
 * realloc/free, memcpy, fixed-width ints, SCNx16/SCNx32, isxdigit. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <inttypes.h>
#include <ctype.h>

#include "gzi.h"
#include "lz77.h"
#include "gzinject.h"
+
+typedef int (*gzi_action_t)(gzi_ctxt_t *ctxt, int pos);
+
+static int gzi_cmd_file(gzi_ctxt_t *ctxt, int pos){
+ ctxt->curfile = ctxt->codes[pos].data & 0xFF;
+ if(verbose){
+ printf("Setting current file to %d\n",ctxt->curfile);
+ }
+ return 1;
+}
+
+static int gzi_cmd_lz77_decomp(gzi_ctxt_t *ctxt, int pos){
+ int32_t curfile = ctxt->curfile;
+ if(curfile<0){
+ printf("Warning: No file Selected, not decompressing.\n");
+ return 0;
+ }
+ if(verbose){
+ printf("LZ77 Decompressing %d\n",curfile);
+ }
+ int decompsize = addpadding(lz77_decompressed_size(ctxt->file_ptrs[curfile]),16);
+ uint8_t *decomp = calloc(decompsize,1);
+ lz77_decompress(ctxt->file_ptrs[curfile],decomp);
+ free(ctxt->file_ptrs[curfile]);
+ ctxt->file_ptrs[curfile] = decomp;
+ ctxt->file_sizes[curfile] = decompsize;
+ return 1;
+}
+
/* gzi command 2: LZ77-compress the currently selected content file in
 * place, replacing the stored buffer and size. Returns 1 on success,
 * 0 if no file has been selected with command 0. */
static int gzi_cmd_lz77_comp(gzi_ctxt_t *ctxt, int pos){
    int32_t curfile = ctxt->curfile;
    if(curfile<0){
        printf("Warning: No file selected, not compressing.\n");
        return 0;
    }
    if(verbose){
        printf("LZ77 Compressing %d\n",curfile);
    }
    uint8_t *comp = NULL;
    uint32_t len = ctxt->file_sizes[curfile];

    // I hate this, but it works for now.
    // NOTE(review): (len & 0x8) is only ever 0 or 8, so this subtracts
    // either 8 or 0 from the length. If the intent was to trim to an
    // 8-byte boundary the mask should likely be 0x7 — confirm against
    // lz77_compress's expectations before changing.
    len -= (8 - (len & 0x8));
    int complen = lz77_compress(ctxt->file_ptrs[curfile],&comp,len,&len);
    free(ctxt->file_ptrs[curfile]);
    ctxt->file_ptrs[curfile] = comp;
    ctxt->file_sizes[curfile] = complen;
    return 1;
}
+
+static int gzi_cmd_apply_patch(gzi_ctxt_t *ctxt, int pos){
+ int32_t curfile = ctxt->curfile;
+ if(curfile<0){
+ printf("Warning: No file selected, not applying patch.\n");
+ }
+ gzi_code_t code = ctxt->codes[pos];
+ uint32_t val = code.data;
+ if(verbose){
+ printf("Apply patch to %d. offset 0x%x = 0x%x\n",curfile,code.offset,code.data);
+ }
+ uint8_t *p;
+ switch(curfile){
+ case GZI_FILE_TMD:
+ p = ctxt->tmd;
+ break;
+ case GZI_FILE_TIK:
+ p = ctxt->tik;
+ break;
+ case GZI_FILE_CERT:
+ p = ctxt->cert;
+ break;
+ default:
+ if(curfile>ctxt->filecnt-1){
+ return -1;
+ }
+ p = ctxt->file_ptrs[curfile];
+ break;
+ }
+ switch(code.len){
+ case 1:
+ *((uint8_t*)(p + code.offset)) = (uint8_t)val;
+ break;
+ case 2:
+ *((uint16_t*)(p + code.offset)) = REVERSEENDIAN16((uint16_t)val);
+ break;
+ case 4:
+ default:
+ *((uint32_t*)(p + code.offset)) = REVERSEENDIAN32(val);
+ break;
+ }
+ return 1;
+}
+
/* Dispatch table indexed by the gzi command byte (high byte of the CCLL
 * field): 0 = select file, 1 = LZ77 decompress, 2 = LZ77 compress,
 * 3 = patch bytes. gzi_run indexes this array directly. */
static gzi_action_t commands[] = {
    gzi_cmd_file,
    gzi_cmd_lz77_decomp,
    gzi_cmd_lz77_comp,
    gzi_cmd_apply_patch,
};
+
/* Read one line (terminated by '\n' or EOF) from `fle` into a freshly
 * allocated NUL-terminated buffer, grown in 256-byte steps. The
 * terminator is not stored. Returns NULL on allocation failure; the
 * caller owns and frees the result. */
static char *readline(FILE *fle){
    char *line = NULL;
    const int buflen = 256;
    for(int i = 0;; ++i){
        int c = fgetc(fle);

        if(i % buflen == 0){
            /* fix: realloc was unchecked — on failure the old code
             * dereferenced NULL and leaked the previous buffer */
            char *grown = realloc(line, i + buflen);
            if(!grown){
                free(line);
                return NULL;
            }
            line = grown;
        }
        if(c == EOF || c == '\n'){
            line[i] = 0;
            return line;
        }
        line[i] = (char)c;
    }
}
+
/* Return nonzero iff `string` consists of exactly `len` hexadecimal
 * digits (so an empty string only matches len == 0). */
int ishexstring(const char *string, size_t len){
    const char *s;
    for(s = string; *s!=0;s++){
        /* fix: isxdigit() has undefined behavior for negative char
         * values — cast through unsigned char first */
        if(!isxdigit((unsigned char)*s)){
            return 0;
        }
    }
    return (size_t)(s - string) == len;
}
+
+void parseline(gzi_ctxt_t *ctxt, const char *line){
+ char command[6]={0};
+ char offset[10]={0};
+ char data[10]={0};
+ sscanf(line,"%5s %9s %9s",command,offset,data);
+ if(!ishexstring(command,4) || !ishexstring(offset,8) || !ishexstring(offset,8))
+ return;
+ ctxt->codecnt++;
+ gzi_code_t *new_codes = realloc(ctxt->codes,sizeof(gzi_code_t) * ctxt->codecnt);
+ if(new_codes){
+ ctxt->codes = new_codes;
+ }
+ gzi_code_t code;
+ uint16_t cmd;
+ sscanf(command,"%"SCNx16,&cmd);
+ code.command = (cmd & 0xFF00) >> 8;
+ code.len = cmd & 0xFF;
+ sscanf(offset,"%"SCNx32,&code.offset);
+ sscanf(data,"%"SCNx32,&code.data);
+ memcpy(ctxt->codes + (ctxt->codecnt - 1),&code,sizeof(code));
+}
+
+int gzi_parse_file(gzi_ctxt_t *ctxt, const char *file){
+ FILE *fle = fopen(file,"r");
+ if(!fle){
+ fprintf(stderr,"Could not open %s, cannot parse file.\n",file);
+ }
+ if(verbose){
+ printf("Parsing gzi file %s\n",file);
+ }
+ while(!feof(fle)){
+ char *line = readline(fle);
+ if(!line){
+ fprintf(stderr,"Could not readline from gzi file %s.\n",file);
+ return 0;
+ }
+ if(line[0]=='#' || line[0]==0){
+ free(line);
+ continue;
+ }
+ parseline(ctxt,line);
+ free(line);
+ }
+ fclose(fle);
+ return 1;
+}
+
+int gzi_run(gzi_ctxt_t *ctxt){
+ if(verbose){
+ printf("Running gzi commands\n");
+ }
+ for(int i=0;icodecnt;i++){
+ commands[ctxt->codes[i].command](ctxt,i);
+ }
+ return 1;
+}
+
+int gzi_init(gzi_ctxt_t *ctxt, uint8_t **files, uint32_t *filesizes, int filecnt,
+ uint8_t *tmd, uint8_t *tik, uint8_t *cert,
+ uint32_t *tmd_size, uint32_t *tik_size, uint32_t *cert_size){
+ ctxt->codes = NULL;
+ ctxt->codecnt=0;
+ ctxt->curfile=-1;
+ ctxt->file_ptrs = files;
+ ctxt->file_sizes = filesizes;
+ ctxt->filecnt = filecnt;
+ ctxt->tmd = tmd;
+ ctxt->tik = tik;
+ ctxt->cert = cert;
+ ctxt->tmd_size = tmd_size;
+ ctxt->tik_size = tik_size;
+ ctxt->cert_size = cert_size;
+ return 1;
+}
+
+int gzi_destroy(gzi_ctxt_t *ctxt){
+ if(ctxt->codes) free(ctxt->codes);
+ return 1;
+}
\ No newline at end of file
diff --git a/tools/gzinject/src/gzi.h b/tools/gzinject/src/gzi.h
new file mode 100644
index 000000000..1b49fcfa9
--- /dev/null
+++ b/tools/gzinject/src/gzi.h
@@ -0,0 +1,39 @@
#ifndef _PATCH_H_
#define _PATCH_H_

/* Restored include target (the header name was lost during extraction). */
#include <stdint.h>

/* Pseudo file indices that select wad metadata blobs instead of contents. */
#define GZI_FILE_TMD 100
#define GZI_FILE_TIK 101
#define GZI_FILE_CERT 102

/* One parsed gzi line: "CCLL OOOOOOOO DDDDDDDD" in hex. */
typedef struct {
    uint8_t command;   /* command id (high byte of CCLL) */
    uint8_t len;       /* operand width in bytes (low byte of CCLL) */
    uint32_t offset;   /* patch offset within the selected file */
    uint32_t data;     /* operand / patch value */
} gzi_code_t;

typedef struct{
    gzi_code_t *codes;     /* parsed codes; owned, freed by gzi_destroy */
    int codecnt;
    int8_t curfile;        /* currently selected file index, -1 = none */
    uint8_t **file_ptrs;   /* caller-owned content buffers */
    uint32_t *file_sizes;
    uint8_t filecnt;
    uint8_t *tmd;          /* caller-owned metadata blobs */
    uint8_t *tik;
    uint8_t *cert;
    uint32_t *tmd_size;
    uint32_t *tik_size;
    uint32_t *cert_size;
} gzi_ctxt_t;

int gzi_parse_file(gzi_ctxt_t *ctxt, const char *file);
int gzi_run(gzi_ctxt_t *ctxt);
int gzi_init(gzi_ctxt_t *ctxt, uint8_t **files, uint32_t *filesizes, int filecnt,
    uint8_t *tmd, uint8_t *tik, uint8_t *cert,
    uint32_t *tmd_size, uint32_t *tik_size, uint32_t *cert_size);
int gzi_destroy(gzi_ctxt_t *ctxt);

#endif
\ No newline at end of file
diff --git a/tools/gzinject/src/gzinject.c b/tools/gzinject/src/gzinject.c
new file mode 100644
index 000000000..1b15397ae
--- /dev/null
+++ b/tools/gzinject/src/gzinject.c
@@ -0,0 +1,1329 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "gzinject.h"
+#include "lz77.h"
+#include "u8.h"
+#include "gzi.h"
+#include "aes.h"
+#include "sha1.h"
+#include "md5.h"
+#include "romchu.h"
+#include "doltool.h"
+#include "fastaes.h"
+
/* AES-128 common key, filled from the key file in main(). */
static uint8_t key[16] = {0};
/* Region byte written into the TMD at offset 0x19d (default 0x03;
 * NOTE(review): region code semantics follow the Wii TMD field — confirm). */
static uint8_t region = 0x03;

static int cleanup = 0;        /* --cleanup: remove outputs before acting */
static int content_num = 5;    /* index of the main (U8 archive) content */

int verbose = 0;               /* --verbose flag, read across all modules */
int dol_after = -1;            /* --dol-after: patch index after which dols apply */

/* Command-line state shared by the action implementations. */
static char *wad = NULL;                 /* input (or output) wad path */
static char *directory = NULL;           /* extract/pack directory */
static char *keyfile = NULL;             /* common-key file path */
static char *workingdirectory = NULL;    /* cwd saved before chdir'ing around */
static char *rom = NULL;                 /* rom to inject / romc input */
static char *outwad = NULL;              /* output wad for inject action */
static patch_list_t *patch = NULL;       /* --patch-file list, command-line order */
static patch_list_t **patch_link = &patch;
static dol_list_t *dol = NULL;           /* --dol-inject list */
static dol_list_t **dol_link = &dol;
static dol_loading_list_t *dol_loading = NULL;   /* load addresses, paired 1:1 with dol */
static dol_loading_list_t **dol_loading_link = &dol_loading;
static char *titleid = NULL;             /* new 4-char channel id */
static char *channelname = NULL;         /* new channel title */
+
/* Decode a 16-bit big-endian value from two bytes. */
uint16_t be16(const uint8_t *p)
{
    uint16_t hi = p[0];
    uint16_t lo = p[1];
    return (uint16_t)((hi << 8) | lo);
}
+
/* Decode a 32-bit big-endian value from four bytes. */
uint32_t be32(const uint8_t *p)
{
    /* fix: the old `p[0] << 24` shifted an int into its sign bit for
     * p[0] >= 0x80, which is undefined behavior; accumulate in uint32_t */
    uint32_t v = 0;
    for (int i = 0; i < 4; i++)
        v = (v << 8) | p[i];
    return v;
}
+
/* getopt_long option table. The flag-style entries (verbose, cleanup)
 * write their int directly; all others dispatch through the short option
 * character in main()'s switch. */
static const struct option cmdoptions[] = {
    { "action",required_argument,0,'a' },
    { "wad",required_argument,0,'w' },
    { "channelid",required_argument,0,'i' },
    { "channeltitle",required_argument,0,'t' },
    { "help",no_argument,0,'h' },
    { "key",required_argument,0,'k' },
    { "region",required_argument,0,'r' },
    { "verbose",no_argument,&verbose,1 },
    { "directory",required_argument,0,'d' },
    { "cleanup", no_argument,&cleanup,1},
    { "version",no_argument,0,'v'},
    { "rom",required_argument,0,'m'},
    { "outputwad",required_argument,0,'o'},
    { "patch-file",required_argument,0,'p'},
    { "content-num",required_argument,0,'c'},
    { "dol-inject",required_argument,0,'f'},
    { "dol-loading",required_argument,0,'l'},
    { "dol-after", required_argument,0,'e'},
    { 0,0,0,0}
};
+
/* Replacement title key written into packed tickets; the bytes spell the
 * ASCII string "GZIsLifeAndBeer!". */
const uint8_t newkey[16] = {
    0x47, 0x5a, 0x49, 0x73, 0x4c, 0x69, 0x66, 0x65, 0x41, 0x6e, 0x64, 0x42, 0x65, 0x65, 0x72, 0x21
};

/* Shared one-shot hash contexts reused by do_sha1 / do_md5. */
static SHA1_CTX sha1;
static MD5_CTX md5;
+
#ifdef FASTAES
/* AES-NI backed AES-128-CBC helpers (fastaes.c). One shared context is
 * reused; every call re-initializes it with the given key and IV. */
static aes_ctxt_t aes;
static void do_encrypt(uint8_t *input, size_t size, const uint8_t *key, const uint8_t *iv) {
    aes_ctx_init(&aes, key, iv);
    aes_encrypt_buffer(&aes, input, size);
}

static void do_decrypt(uint8_t *input, size_t size, const uint8_t *key, const uint8_t *iv) {
    aes_ctx_init(&aes, key, iv);
    aes_decrypt_buffer(&aes, input, size);
}

#else

/* Software fallback — the AES_ctx / AES_init_ctx_iv API matches the
 * tiny-AES-c library (presumably vendored; confirm in aes.h/aes.c). */
static struct AES_ctx aes;
static void do_encrypt(uint8_t *input, size_t size, const uint8_t *key, const uint8_t *iv) {
    AES_init_ctx_iv(&aes, key, iv);
    AES_CBC_encrypt_buffer(&aes, input, size);
}

static void do_decrypt(uint8_t *input, size_t size, const uint8_t *key, const uint8_t *iv) {
    AES_init_ctx_iv(&aes, key, iv);
    AES_CBC_decrypt_buffer(&aes, input, size);
}
#endif
+
/* One-shot SHA-1 of `size` bytes of `input`; writes 20 bytes to `output`. */
 static void do_sha1(uint8_t *input, uint8_t *output, size_t size) {
    SHA1Init(&sha1);
    SHA1Update(&sha1, input, size);
    SHA1Final(output, &sha1);
}

/* One-shot MD5 of `size` bytes of `input`; writes 16 bytes to `output`. */
 static void do_md5(uint8_t *input, uint8_t *output, size_t size) {
    MD5_Init(&md5);
    MD5_Update(&md5, input, size);
    MD5_Final(output, &md5);
}
+
/* Round `inp` up to the next multiple of `padding` (unchanged when already
 * aligned). `padding` must be nonzero. */
uint32_t addpadding(uint32_t inp, uint32_t padding) {
    /* fix: the intermediate was a signed int, which truncated/overflowed
     * for inputs above INT_MAX */
    uint32_t rem = inp % padding;
    return rem ? inp + (padding - rem) : inp;
}
+
/* Read the big-endian 32-bit content length for entry `contentnum` of a
 * TMD blob: records are 36 bytes each starting at 0x1ec, with the size
 * field 4 bytes into the record. */
static uint32_t getcontentlength(uint8_t *tmd, uint32_t contentnum) {
    const uint8_t *rec = tmd + 0x1ec + (36 * contentnum) + 4;
    uint32_t len = 0;
    for (int i = 0; i < 4; i++)
        len = (len << 8) | rec[i];
    return len;
}
+
/* Write `size` big-endian into TMD content record `contentnum` (36-byte
 * records from 0x1ec, size at offset +4).
 * fix: byte stores replace the old casted uint32_t write, which was
 * unaligned (UB on strict-alignment targets) and relied on a
 * little-endian host for the REVERSEENDIAN32 trick. */
static void setcontentlength(uint8_t *tmd, uint32_t contentnum, uint32_t size){
    uint8_t *rec = tmd + 0x1ec + (36 * contentnum) + 4;
    rec[0] = (uint8_t)(size >> 24);
    rec[1] = (uint8_t)(size >> 16);
    rec[2] = (uint8_t)(size >> 8);
    rec[3] = (uint8_t)size;
}
+
static void removedir(const char *file);

/* Remove `file` from disk: directories are deleted recursively via
 * removedir (which recurses back here per entry), regular files via
 * remove(). Other file types and missing paths are silently skipped. */
static void removefile(const char* file) {
    struct stat sbuffer;
    if (stat(file, &sbuffer) == 0) {
        if ((sbuffer.st_mode & S_IFMT) == S_IFDIR) {
            removedir(file);
        }
        else if ((sbuffer.st_mode & S_IFMT) == S_IFREG) {
            if (verbose) {
                printf("Removing %s\n", file);
            }
            remove(file);
        }

    }
}
+
+static void removedir(const char *file) {
+ DIR *dir;
+ struct dirent *ent;
+ if ((dir = opendir(file)) != NULL) {
+ while ((ent = readdir(dir)) != NULL) {
+ if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0)
+ continue;
+ char *path = malloc(1000);
+ snprintf(path, 1000, "%s/%s", file, ent->d_name);
+ removefile(path);
+ free(path);
+ }
+ if (verbose) {
+ printf("Removing %s\n", file);
+ }
+ rmdir(file);
+ }
+
+}
+
/* Return a newly allocated copy of `mystr` with the final ".ext" stripped
 * (a plain copy when there is no dot). Returns NULL for NULL input or on
 * allocation failure; the caller frees the result. */
static char *removeext(char* mystr) {
    if (mystr == NULL)
        return NULL;
    size_t n = strlen(mystr);
    char *copy = malloc(n + 1);
    if (copy == NULL)
        return NULL;
    memcpy(copy, mystr, n + 1);
    char *dot = strrchr(copy, '.');
    if (dot != NULL)
        *dot = '\0';
    return copy;
}
+
/* Print the command-line help text to stdout. */
static void print_usage() {
    puts("Usage:\n"
        "  gzinject -a extract -w SOURCEWAD [options]\n"
        "  gzinject -a pack -w DESTWAD [options]\n"
        "  gzinject -a inject -w SOURCEWAD -m ROM [options]\n"
        "  gzinject -a romc -m INROM -o OUTROM [options]\n"
        "  gzinject -a genkey [options]\n"
        "  gzinject --help\n"
        "  gzinject --version\n\n"
        "Actions:\n"
        "  extract  extracts SOURCEWAD to directory\n"
        "  pack  packs directory into DESTWAD\n"
        "  inject  injects rom into SOURCEWAD\n"
        "  romc  decompresses a romc compressed rom\n"
        "  genkey  generates wii common-key\n\n"
        "Options:\n"
        "  -i, --channelid=ID  New Channel ID For Pack and Inject actions (default: none)\n"
        "  -t, --title=title  New Channel name for pack and inject actions (default: none)\n"
        "  -h, --help  Prints this help message\n"
        "  -k, --key=keyfile  Location of the common-key file (default: ./common-key.bin)\n"
        "  -r, --region=1-3  Region to use (default: 3)\n"
        "  --verbose  Print out verbose program execution information\n"
        "  -d, --directory=directory  Directory to extract contents to, or directory to read contents from (default: ./wadextract)\n"
        "  --cleanup  Remove files before performing actions\n"
        "  --version  Prints the current version\n"
        "  -m, --rom=rom  Rom to inject for inject action (default: none), also rom to romc decompress\n"
        "  -o, --outputwad=outwad  The output wad for inject actions (default: SOURCEWAD-inject.wad), also output for romc decompression\n"
        "  -p, --patch-file=patchfile  gzi file to use for applying patches (default: none)\n"
        "  -c, --content=contentfile  the primary content file (default: 5)\n"
        "  --dol-inject  Binary data to inject into the emulator program, requires --dol-loading\n"
        "  --dol-loading  The loading address for the binary specified by --dol-inject\n"
        "  --dol-after  After which patch file to inject the dol, default: after all patches\n"
    );
}
+
+static void print_version(const char* prog) {
+ printf("%s Version ", prog);
+ printf(GZINJECT_VERSION);
+ printf("\n");
+}
+
/* "Trucha"/fakesign the ticket (W_TIK) or TMD (W_TMD) blob: zero the RSA
 * signature body, then brute-force a 16-bit filler word placed in unused
 * padding (`pos`) until the SHA-1 of the signed area begins with a 0x00
 * byte — the condition the old broken IOS signature check accepted. */
static void truchasign(uint8_t *data, uint8_t type, size_t len) {
    /* offset of writable padding inside each structure */
    uint16_t pos = 0x1f2;
    if (type == W_TMD) {
        pos = 0x1d4;
    }

    uint8_t digest[20];
    /* the signed area starts at 0x140, just past the signature block */
    do_sha1(data + 0x140, digest, len - 0x140);

    uint16_t i;
    if (digest[0] != 0x00) {
        /* blank the signature body */
        for (i = 4; i < 260; i++) {
            data[i] = 0x00;
        }
        /* try successive filler values until the leading hash byte is 0 */
        for (i = 0; i < 0xFFFF; i++) {
            uint16_t revi = REVERSEENDIAN16(i);
            memcpy(data + pos, &revi, 2);

            do_sha1(data + 0x140, digest, len - 0x140);

            if (digest[0] == 0x00) {
                break;
            }
        }
    }
}
+
/* Extract the WAD at `wad` into `directory`: split out cert/ticket/TMD/
 * footer, decrypt the title key from the ticket with the common key, then
 * decrypt every content, unpacking the main content's U8 archive.
 * Returns 1 on success, 0 on any error.
 * NOTE(review): several error paths below return without fclose(outfile);
 * the leaks are process-lifetime only but worth tightening. */
static int do_extract() {
    struct stat sbuffer;

    if (stat(wad, &sbuffer) != 0) {
        printf("Could not open %s\n", wad);
        return 0;
    }

    if (verbose) {
        printf("Extracting %s to %s\n", wad, directory);
    }

    /* slurp the whole wad into memory */
    uint8_t *data = (uint8_t*)malloc(sbuffer.st_size);
    if(!data){
        fprintf(stderr,"Could not allocate %ld bytes for wad\n",sbuffer.st_size);
        return 0;
    }
    FILE *wadfile = fopen(wad, "rb");
    if(!wadfile){
        fprintf(stderr,"Could not open %s wad file\n",wad);
        free(data);
        return 0;
    }
    int bytesread = fread(data, 1, sbuffer.st_size, wadfile);
    if(bytesread!=sbuffer.st_size || ferror(wadfile)){
        fprintf(stderr,"Could not read total wad, or file error occured");
        free(data);
        fclose(wadfile);
        return 0;
    }
    fclose(wadfile);
    /* wad magic check (" Is\0" tag at offset 3) */
    if (be32(&data[3]) != 0x20497300) {
        fprintf(stderr,"%s is an invalid wad file!\n",wad);
        free(data);
        return 0;
    }

    /* section sizes from the 0x40-byte wad header */
    uint32_t certsize = be32(data + 0x08);
    uint32_t tiksize = be32(data + 0x10);
    uint32_t tmdsize = be32(data + 0x14);
    uint32_t datasize = be32(data + 0x18);
    uint32_t footersize = be32(data + 0x1C);

    /* each section is padded to a 64-byte boundary */
    uint32_t certpos = 0x40;
    uint32_t tikpos = 0x40 + addpadding(certsize, 64);
    uint32_t tmdpos = tikpos + addpadding(tiksize, 64);
    uint32_t datapos = tmdpos + addpadding(tmdsize, 64);
    uint32_t footerpos = datapos + addpadding(datasize,64);

    if (cleanup == 1) removedir(directory);

    /* create the output directory if it does not already exist */
    stat(directory,&sbuffer);
    if(S_ISDIR(sbuffer.st_mode)){
        if(verbose){
            printf("%s exists, not creating.\n",directory);
        }
    }else{
        if(verbose)
            printf("Creating %s\n",directory);
        if(mkdir(directory, 0755)==-1){
            fprintf(stderr,"Could not mkdir %s\n",directory);
            free(data);
            return 0;
        }
    }

    if(chdir(directory)==-1){
        fprintf(stderr,"Could not chdir to %s\n",directory);
        free(data);
        return 0;
    }

    uint16_t contentcount = be16(data + tmdpos + 0x1de);

    /* dump the raw metadata sections */
    if (verbose) {
        printf("Writing cert.cert.\n");
    }
    FILE* outfile = fopen("cert.cert", "wb");
    if(!outfile){
        perror("Could not open cert.cert for writing\n");
        free(data);
        return 0;
    }

    fwrite(data + certpos, 1, certsize, outfile);
    if(ferror(outfile)){
        perror("Could not write to cert.cert\n");
        free(data);
        return 0;
    }
    fclose(outfile);

    if (verbose) {
        printf("Writing ticket.tik.\n");
    }
    outfile = fopen("ticket.tik", "wb");
    if(!outfile){
        perror("Could not open ticket.tik for writing.\n");
        free(data);
        return 0;
    }
    fwrite(data + tikpos, 1, tiksize, outfile);
    if(ferror(outfile)){
        perror("Could not write to ticket.tik\n");
        free(data);
        return 0;
    }
    fclose(outfile);

    if (verbose) {
        printf("Writing metadata.tmd.\n");
    }
    outfile = fopen("metadata.tmd", "wb");
    if(!outfile){
        perror("Could not open metadata.tmd for writing\n");
        free(data);
        return 0;
    }
    fwrite(data + tmdpos, 1, tmdsize, outfile);
    if(ferror(outfile)){
        perror("Could not write to metadata.tmd\n");
        free(data);
        return 0;
    }
    fclose(outfile);

    if(verbose){
        printf("Writing footer.bin\n");
    }
    outfile = fopen("footer.bin","wb");
    if(!outfile){
        perror("Could not open footer.bin for writing.\n");
        free(data);
        return 0;
    }
    fwrite(data + footerpos, 1, footersize, outfile);
    if(ferror(outfile)){
        perror("Could not write to footer.bin\n");
        free(data);
        return 0;
    }
    fclose(outfile);

    /* decrypt the title key: encrypted key at ticket+0x1bf, IV is the
     * 8-byte title id at ticket+0x1dc padded with zeros */
    uint8_t encryptedkey[16], iv[16];

    uint8_t i, j;
    for (i = 0; i < 16; i++) {
        encryptedkey[i] = data[tikpos + 0x1bf + i];
    }
    for (i = 0; i < 8; i++) {
        iv[i] = data[tikpos + 0x1dc + i];
        iv[i + 8] = 0x00;
    }
    ;
    do_decrypt(encryptedkey, 16, key, iv);

    /* per-content IV is the content index in the first two bytes, rest 0 */
    for (j = 2; j < 16; j++) iv[j] = 0x00;

    uint8_t *contentpos = data + datapos;

    /* NOTE(review): `i` is uint8_t while contentcount is uint16_t; counts
     * above 255 would loop forever — confirm wads never exceed that. */
    for (i = 0; i < contentcount; i++) {

        iv[0] = data[tmdpos + 0x1e8 + (0x24 * i)];
        iv[1] = data[tmdpos + 0x1e9 + (0x24 * i)];

        uint32_t size = addpadding(getcontentlength(data + tmdpos, i), 16);

        if (verbose) {
            printf("Decrypting contents %d.\n", i);
        }

        do_decrypt(contentpos, size, encryptedkey, iv);

        // Main rom content file
        if (i == content_num) {
            if (verbose) {
                printf("Extracting content %d uint8_t Archive.\n",content_num);
            }
            char dbuf[100];
            snprintf(dbuf,100,"content%d",content_num);
            if(!extract_u8_archive(contentpos,dbuf)){
                perror("Could not extract u8 archive");
                free(data);
                return 0;
            }
        }

        char contentname[100];
        snprintf(contentname, 100, "content%d.app", i);
        if (verbose) {
            printf("Writing %s.\n", contentname);
        }
        outfile = fopen(contentname, "wb");
        if(!outfile){
            fprintf(stderr,"Could not open %s for writing\n",contentname);
            free(data);
            return 0;
        }
        /* write the true (unpadded) content length */
        fwrite(contentpos, 1, getcontentlength(data + tmdpos, i), outfile);
        if(ferror(outfile)){
            fprintf(stderr,"Could not write to %s\n",contentname);
            free(data);
            return 0;
        }
        fclose(outfile);
        /* contents are aligned to 64 bytes inside the data section */
        contentpos += addpadding(size, 64);
    }
    chdir("..");
    free(data);
    return 1;
}
+
+static int apply_dol_patch(const char *dol_file, uint32_t loading_address, uint8_t **data, uint32_t *size){
+ if(verbose){
+ printf("Injecting dol file %s\n",dol_file);
+ }
+ struct stat sbuffer;
+ chdir(workingdirectory);
+ doltool_ctxt_t *dolctxt = calloc(1,sizeof(*dolctxt));
+ if(!dolctxt){
+ perror("Could not create dol ctxt");
+ errno = ENOMEM;
+ return -1;
+ }
+ dol_load(dolctxt,data,size);
+ FILE *inject_file = fopen(dol_file,"rb");
+ if(!inject_file){
+ free(dolctxt);
+ perror(dol_file);
+ errno = ENOENT;
+ return -1;
+ }
+ stat(dol_file,&sbuffer);
+ uint8_t *inject_data = malloc(sbuffer.st_size);
+ fread(inject_data,1,sbuffer.st_size,inject_file);
+ fclose(inject_file);
+ dol_inject(dolctxt,inject_data,sbuffer.st_size,loading_address);
+ dol_save(dolctxt);
+ free(dolctxt);
+ free(inject_data);
+ chdir(directory);
+ return 0;
+}
+
+static int do_pack() {
+ DIR *testdir = opendir(directory);
+ if (testdir) {
+ closedir(testdir);
+ }
+ else {
+ fprintf(stderr,"%s doesn't exist, or is not a directory!\n", directory);
+ return 0;
+ }
+
+ if (verbose) {
+ printf("Packing %s into %s\n", directory, wad);
+ }
+ if(chdir(directory)==-1){
+ fprintf(stderr,"Could not change directory to %s",directory);
+ return 0;
+ }
+
+ if (verbose) {
+ printf("Gathering WAD Header Information\n");
+ }
+
+ struct stat sbuffer;
+ if(stat("cert.cert", &sbuffer)!=0){
+ perror("Could not stat cert.cert\n");
+ return 0;
+ }
+ uint32_t certsize = sbuffer.st_size;
+
+ if(stat("ticket.tik", &sbuffer)!=0){
+ perror("Could not stat ticket.tik\n");
+ return 0;
+ }
+ uint32_t tiksize = sbuffer.st_size;
+
+ if(stat("metadata.tmd", &sbuffer)!=0){
+ perror("Could not stat metadata.tmd\n");
+ return 0;
+ }
+ uint32_t tmdsize = sbuffer.st_size;
+
+ if (verbose) {
+ printf("Reading cert.cert\n");
+ }
+ FILE *infile = fopen("cert.cert", "rb");
+ if(!infile){
+ perror("Could not open cert.cert for reading\n");
+ return 0;
+ }
+ uint8_t *cert = calloc(addpadding(certsize, 64), sizeof(uint8_t));
+ if(!cert){
+ fprintf(stderr,"Could not allocate %d bytes for cert\n",certsize);
+ return 0;
+ }
+ int bytesread = fread(cert, 1, certsize, infile);
+ if(bytesread!=certsize || ferror(infile)){
+ perror("Error reading from cert.cert\n");
+ free(cert);
+ return 0;
+ }
+ fclose(infile);
+
+ if (verbose) {
+ printf("Reading ticket.cert\n");
+ }
+ infile = fopen("ticket.tik", "rb");
+ if(!infile){
+ perror("Could not open ticket.tik for reading\n");
+ free(cert);
+ return 0;
+ }
+ uint8_t *tik = calloc(addpadding(tiksize, 64), sizeof(uint8_t));
+ if(!tik){
+ fprintf(stderr,"Could not allocate %d bytes for ticket\n",tiksize);
+ free(cert);
+ return 0;
+ }
+ bytesread = fread(tik, 1, tiksize, infile);
+ if(bytesread!=tiksize || ferror(infile)){
+ perror("Error reading from ticket.tik\n");
+ free(cert);
+ free(tik);
+ return 0;
+ }
+ fclose(infile);
+
+ if (verbose) {
+ printf("Reading metadata.tmd\n");
+ }
+ infile = fopen("metadata.tmd", "rb");
+ if(!infile){
+ perror("Could not open metadata.tmd for reading\n");
+ free(cert);
+ free(tik);
+ return 0;
+ }
+ uint8_t *tmd = calloc(addpadding(tmdsize, 64), sizeof(uint8_t));
+ if(!tmd){
+ fprintf(stderr,"Could not allocate %d bytes for tmd\n",tmdsize);
+ free(cert);
+ free(tik);
+ return 0;
+ }
+ bytesread = fread(tmd, 1, tmdsize, infile);
+ if(bytesread!=tmdsize || ferror(infile)){
+ perror("Error reading from tmddata.tmd\n");
+ free(cert);
+ free(tik);
+ free(tmd);
+ return 0;
+ }
+ fclose(infile);
+
+ if (verbose) {
+ printf("Generating Footer signature\n");
+ }
+ char footer[0x40] = {0};
+ sprintf(footer,"gzinject v%s https://github.com/krimtonz/gzinject", GZINJECT_VERSION);
+ uint32_t footersize = 0x40;
+
+ // Build Content5 into a .app file first
+ char dbuf[100], nbuf[100] = {0};
+ snprintf(dbuf,100,"content%d",content_num);
+ strcpy(nbuf,dbuf);
+ strcat(nbuf,".app");
+ if(verbose){
+ printf("Generating %s u8 archive\n",nbuf);
+ }
+
+ int content5len = create_u8_archive(dbuf,nbuf);
+ if(!content5len){
+ fprintf(stderr,"Could not create u8 archive from %s into %s\n",dbuf,nbuf);
+ free(cert);
+ free(tik);
+ free(tmd);
+ return 0;
+ }
+ chdir(workingdirectory);
+ chdir(directory);
+ if (verbose) {
+ printf("Modifying content metadata in the TMD\n");
+ }
+ uint16_t contentsc = be16(tmd + 0x1DE);
+ int i;
+
+ char cfname[100];
+ uint8_t **fileptrs = malloc(sizeof(*fileptrs) * contentsc);
+ if(!fileptrs){
+ perror("Could not allocate filepointers.\n");
+ free(cert);
+ free(tik);
+ free(tmd);
+ return 0;
+ }
+ uint32_t *filesizes = malloc(sizeof(*filesizes) * contentsc);
+ if(!filesizes){
+ perror("Could not allocate filesizes\n");
+ free(cert);
+ free(tik);
+ free(tmd);
+ free(fileptrs);
+ return 0;
+ }
+
+ for (i = 0; i < contentsc; i++) {
+ snprintf(cfname, 30, "content%d.app", i);
+ stat(cfname, &sbuffer);
+ filesizes[i] = addpadding(sbuffer.st_size,16);
+ fileptrs[i] = calloc(filesizes[i],1);
+ if(!fileptrs[i]){
+ fprintf(stderr,"Could not allocate %ld bytes for %s\n",sbuffer.st_size,cfname);
+ goto error;
+ }
+ infile = fopen(cfname,"rb");
+ if(!infile){
+ fprintf(stderr,"Could not open %s for reading\n",cfname);
+ goto error;
+ }
+ bytesread = fread(fileptrs[i],1,sbuffer.st_size,infile);
+ if(bytesread!=sbuffer.st_size || ferror(infile)){
+ fprintf(stderr,"Error reading from %s\n",cfname);
+ goto error;
+ }
+ fclose(infile);
+ setcontentlength(tmd,i,filesizes[i]);
+ }
+
+ int patch_idx = 0;
+ int dol_applied = 0;
+ if(dol_after>=101) dol_after-=101;
+
+ while(patch){
+ if(verbose){
+ printf("Applying %s gzi patches\n",patch->filename);
+ }
+
+ if(chdir(workingdirectory)!=0){
+ fprintf(stderr,"Could not change directory to %s",workingdirectory);
+ }
+ gzi_ctxt_t gzi;
+ if(!gzi_init(&gzi,fileptrs,filesizes,contentsc,tmd,tik,cert,&tmdsize,&tiksize,&certsize)){
+ perror("Could not initialize patch file");
+ goto error;
+
+ }
+ if(!gzi_parse_file(&gzi,patch->filename)){
+ perror("Could not parse gzi patch file");
+ goto error;
+ }
+ if(!gzi_run(&gzi)){
+ perror("Could not run gzi patch file");
+ goto error;
+ }
+ if(chdir(directory)!=0){
+ fprintf(stderr,"Could not change directory to %s",directory);
+ goto error;
+ }
+
+ for(int i=0;inext;
+ free(old_patch);
+ if(dol_after == patch_idx){
+ while(dol && dol_loading){
+ if (apply_dol_patch(dol->filename,dol_loading->loading_address,&fileptrs[1],&filesizes[1]) != 0) {
+ fprintf(stderr, "Could not inject dol patch\n");
+ goto error;
+ }
+ dol_list_t *old_dol = dol;
+ dol = dol->next;
+ free(old_dol);
+ dol_loading_list_t *old_loading = dol_loading;
+ dol_loading = dol_loading->next;
+ free(old_loading);
+ }
+ dol_applied = 1;
+ setcontentlength(tmd,1,filesizes[1]);
+ }
+ patch_idx++;
+ }
+
+ if(!dol_applied && dol && dol_loading){
+ while(dol && dol_loading){
+ if (apply_dol_patch(dol->filename,dol_loading->loading_address,&fileptrs[1],&filesizes[1]) != 0) {
+ fprintf(stderr, "Could not inject dol patch\n");
+ goto error;
+ }
+ dol_list_t *old_dol = dol;
+ dol = dol->next;
+ free(old_dol);
+ dol_loading_list_t *old_loading = dol_loading;
+ dol_loading = dol_loading->next;
+ free(old_loading);
+ }
+ setcontentlength(tmd,1,filesizes[1]);
+ }
+
+ // Change Title ID
+ if (titleid != NULL) {
+ if (verbose) {
+ printf("Changing Channel ID\n");
+ }
+ memcpy(tik + 0x1e0, titleid, 4);
+ memcpy(tmd + 0x190, titleid, 4);
+ }
+
+ if (verbose) {
+ printf("Changing region in the TMD\n");
+ }
+ // Change the Region
+ tmd[0x19d] = region;
+
+ if (verbose) {
+ printf("Changing encryption key in the ticket\n");
+ }
+ // New key
+ memcpy(tik + 0x1bf, &newkey, 16);
+
+ //Decrypt the new key
+ uint8_t newenc[16];
+ uint8_t iv[16];
+
+ for (i = 0; i < 16; i++) {
+ newenc[i] = *(tik + 0x1bf + i);
+ }
+ for (i = 0; i < 8; i++) {
+ iv[i] = *(tik + 0x1dc + i);
+ iv[i + 8] = 0x00;
+ }
+
+ do_decrypt(newenc, 16, key, iv);
+
+ int j;
+
+ for (j = 2; j < 15; j++) {
+ iv[j] = 0x00;
+ }
+
+ for (i = 0; i < contentsc; i++) {
+ uint8_t *contents = fileptrs[i];
+
+ if (i == 0) {
+ if (channelname != NULL) {
+ if (verbose) {
+ printf("Changing the Channel Name in content0.app\n");
+ }
+
+ uint16_t imetpos = -1;
+ for (j = 0; j < 400; j++) {
+ if (strcmp((char*)(contents + j),"IMET")==0) {
+ imetpos = j;
+ break;
+ }
+ }
+ if(imetpos!=-1){
+ uint16_t count = 0;
+ size_t cnamelen = strlen(channelname);
+ char namebuf[40] = {0};
+ for(j=0,count=0;count0){
+ fwrite(&padding,1,padcnt,outwadfile);
+ if(ferror(outwadfile)){
+ perror("Could not write write content padding\n");
+ goto error;
+ }
+ }
+ }
+ if (verbose) {
+ printf("Writing footer\n");
+ }
+ fwrite(footer, 1, 0x40, outwadfile);
+ if(ferror(outwadfile)){
+ perror("Could not write footer\n");
+ goto error;
+ }
+ fclose(outwadfile);
+
+
+ free(cert);
+ free(tik);
+ free(tmd);
+ for(i=0;ifilename = optarg;
+ new_patch->next = NULL;
+ *patch_link = new_patch;
+ patch_link = &new_patch->next;
+ break;
+ }
+ case 'c':
+ content_num = optarg[0] - 0x30;
+ if(content_num<0 || content_num>9) content_num=5;
+ break;
+ case 'f':
+ {
+ dol_list_t *new_dol = malloc(sizeof(*new_dol));
+ if(new_dol == NULL){
+ perror("Could not allocate dol list");
+ exit(1);
+ }
+ new_dol->filename = optarg;
+ new_dol->next = NULL;
+ *dol_link = new_dol;
+ dol_link = &new_dol->next;
+ break;
+ }
+ case 'l':{
+ char loading_address[10];
+ sscanf(optarg,"%9s",loading_address);
+ uint32_t addr;
+ sscanf(loading_address,"%"SCNx32,&addr);
+ dol_loading_list_t *new_dol_loading = malloc(sizeof(*new_dol_loading));
+ if(new_dol_loading == NULL){
+ perror("Could not allocate dol loading address.");
+ exit(1);
+ }
+ new_dol_loading->loading_address = addr;
+ new_dol_loading->next = NULL;
+ *dol_loading_link = new_dol_loading;
+ dol_loading_link = &new_dol_loading->next;
+ break;
+ }
+ case 'e': {
+ char dol_after_str[10];
+ sscanf(optarg, "%s", dol_after_str);
+ sscanf(dol_after_str, "%"SCNu32, &dol_after);
+ break;
+ }
+ default:
+ break;
+ }
+
+ }
+
+ if (action == NULL) {
+ print_usage();
+ exit(1);
+ }
+
+ if(strcmp(action, "romc") == 0){
+ romc();
+ return 0;
+ }
+
+ if (strcmp(action, "genkey") == 0){
+ genkey();
+ return 0;
+ }
+
+ if (strcmp(action, "extract") != 0 && strcmp(action, "pack") != 0 && strcmp(action, "inject") != 0) {
+ print_usage();
+ exit(1);
+ }
+
+ if (wad == NULL) {
+ print_usage();
+ exit(1);
+ }
+
+ if (directory == NULL) directory = "wadextract";
+
+ struct stat sbuffer;
+ if (keyfile == NULL) {
+ if (stat("key.bin", &sbuffer) == 0) {
+ keyfile = "key.bin";
+ }
+ else if (stat("common-key.bin", &sbuffer) == 0) {
+ keyfile = "common-key.bin";
+ }
+ else {
+ printf("Cannot find key.bin or common-key.bin.\n");
+ exit(1);
+ }
+ }
+ else {
+ if (stat(keyfile, &sbuffer) != 0) {
+ printf("Cannot find keyfile specified.\n");
+ exit(1);
+ }
+ }
+
+ FILE *fkeyfile = fopen(keyfile, "rb");
+ if(!fkeyfile){
+ perror("Could not open keyfile");
+ exit(1);
+ }
+
+ fread(&key, 1, 16, fkeyfile);
+ if(ferror(fkeyfile)){
+ perror("Could not read from keyfile.");
+ exit(1);
+ }
+ fclose(fkeyfile);
+
+ workingdirectory = malloc(200);
+ if(!workingdirectory){
+ perror("Could not allocate for working directory");
+ exit(1);
+ }
+ workingdirectory = getcwd(workingdirectory, 200);
+
+ if (strcmp(action, "extract") == 0) {
+ if(!do_extract()){
+ exit(1);
+ }
+ }
+ else if (strcmp(action, "pack") == 0) {
+ if(!do_pack()){
+ exit(1);
+ }
+ }
+ else if (strcmp(action, "inject") == 0) {
+ if (rom == NULL) {
+ printf("-a inject specified, but no rom to inject\n");
+ free(workingdirectory);
+ exit(1);
+
+ }
+ if(!do_extract()){
+ perror("Could not extract wad\n");
+ free(workingdirectory);
+ exit(1);
+ }
+
+ if (verbose) {
+ printf("Copying %s to %s/content%d/rom\n", rom, directory,content_num);
+ }
+ FILE *from = fopen(rom, "rb");
+ fseek(from, 0, SEEK_END);
+ size_t fromlen = ftell(from);
+ fseek(from, 0, SEEK_SET);
+ uint8_t *inrom = malloc(fromlen);
+ if(!inrom){
+ perror("could not allocate input rom\n");
+ free(workingdirectory);
+ exit(1);
+ }
+ fread(inrom, 1, fromlen, from);
+ fclose(from);
+
+ char *orom = malloc(200);
+ if(!orom){
+ perror("Could not allocate output rom name\n");
+ free(workingdirectory);
+ free(inrom);
+ exit(1);
+ }
+ snprintf(orom, 200, "%s/content%d/rom", directory,content_num);
+ from = fopen(orom, "wb");
+ fwrite(inrom, 1, fromlen, from);
+ fclose(from);
+ free(inrom);
+ free(orom);
+
+
+ char *wadname = removeext(wad),
+ *outname = malloc(strlen(wadname) + 12);
+ if(!outname){
+ perror("could not allocate for output wad name\n");
+ free(workingdirectory);
+ exit(1);
+ }
+ sprintf(outname, "%s-inject.wad", wadname);
+ free(wadname);
+ if (outwad == NULL) {
+ wad = outname;
+ }
+ else {
+ wad = outwad;
+ }
+
+ if(!do_pack()){
+ perror("Could not pack wad\n");
+ free(outname);
+ free(workingdirectory);
+ exit(1);
+ }
+ free(outname);
+ }
+
+ free(workingdirectory);
+ return 0;
+}
diff --git a/tools/gzinject/src/gzinject.h b/tools/gzinject/src/gzinject.h
new file mode 100644
index 000000000..493557caf
--- /dev/null
+++ b/tools/gzinject/src/gzinject.h
@@ -0,0 +1,50 @@
#ifndef _GZINJECT_H_
#define _GZINJECT_H_

/* NOTE(review): the original two #include lines were stripped in extraction;
 * stdint.h is required for the uint*_t types below — confirm the second
 * header against upstream gzinject 0.3.3. */
#include <stdint.h>
#include <stdio.h>

/* Byte-swap helpers.  Fixed: the whole expansion is now parenthesized so the
 * macros compose safely inside larger expressions (the original expanded to a
 * bare `a | b | c | d`, which mis-binds next to operators of higher
 * precedence, e.g. `2 * REVERSEENDIAN16(x)`). */
#define REVERSEENDIAN32(X) ((((X) >> 24) & 0xff) | (((X) << 8) & 0xFF0000) | (((X) >> 8) & 0xff00) | (((X) << 24) & 0xff000000))
#define REVERSEENDIAN16(X) ((((X) >> 8) & 0xff) | (((X) << 8) & 0xFF00))

/* Indices of the ticket (tik) and TMD sections — presumably into a wad
 * section table; TODO confirm against gzinject.c. */
#define W_TIK 0x00
#define W_TMD 0x01
#define GZINJECT_VERSION "0.3.3"

#ifdef _WIN32
/* Windows variants: mkdir() takes no mode argument, getcwd() is _getcwd(). */
#define mkdir(X,Y) mkdir(X)
#define getcwd(X,Y) _getcwd(X,Y)
#endif

/* Kind of entry inside a U8 archive. */
typedef enum{
	FILE_DIRECTORY,
	FILE_NORMAL
}filetype_t;

/* Singly linked list of patch file names. */
typedef struct patch_list patch_list_t;
struct patch_list {
	const char *filename;
	patch_list_t *next;
};

/* Singly linked list of dol file names. */
typedef struct dol_list dol_list_t;
struct dol_list{
	const char *filename;
	dol_list_t *next;
};

/* Loading addresses paired (positionally) with dol_list entries. */
typedef struct dol_loading_list dol_loading_list_t;
struct dol_loading_list{
	uint32_t loading_address;
	dol_loading_list_t *next;
};

/* Read a big-endian 16/32-bit value from a byte pointer. */
uint16_t be16(const uint8_t *p);
uint32_t be32(const uint8_t *p);

/* Round inp up to the next multiple of padding. */
uint32_t addpadding(uint32_t inp, uint32_t padding);

/* Global verbosity flag (defined in gzinject.c). */
extern int verbose;

#endif
diff --git a/tools/gzinject/src/lz77.c b/tools/gzinject/src/lz77.c
new file mode 100644
index 000000000..5f9063ab3
--- /dev/null
+++ b/tools/gzinject/src/lz77.c
@@ -0,0 +1,175 @@
+#include
+#include
+#include
+#include "lz77.h"
+#include "gzinject.h"
+
/*
 * Walk a GBA-style LZ77 stream (0x10 magic byte, 24-bit little-endian
 * decompressed size, then flag-byte-prefixed groups of 8 tokens) and return
 * the number of compressed bytes it occupies, rounded up to the 4-byte
 * alignment the streams are stored with.  Returns -1 if the magic byte is
 * wrong or a back-reference reaches before the start of the output (i.e.
 * the data is not a valid stream).
 *
 * NOTE(review): the original body was damaged in extraction (all loop
 * conditions between '<' and '>' were lost); this is a reconstruction from
 * the 0x10/LZ77 format semantics and the surviving fragments (the "+ 3"
 * length bias and the "disp <= pos" validity check).  Verify against
 * upstream gzinject's lz77.c.
 */
int lz77_compressed_length(uint8_t *src){
	if(*src!=0x10){
		return -1;
	}
	/* Decompressed size lives in bytes 1-3, little-endian. */
	uint32_t size = (uint32_t)src[1] | ((uint32_t)src[2] << 8) | ((uint32_t)src[3] << 16);
	uint32_t pos = 0;	/* bytes of output accounted for so far */
	int idx = 4;		/* bytes of compressed input consumed */
	while(pos < size){
		uint8_t flags = src[idx++];
		/* Each flag byte describes up to 8 tokens, MSB first:
		 * 1 = back-reference (2 bytes), 0 = literal (1 byte). */
		for(int bit = 0; bit < 8 && pos < size; bit++){
			if(flags & 0x80){
				/* 4-bit length (+3), 12-bit displacement (+1). */
				int n = ((src[idx] >> 4) & 0xF) + 3;
				uint32_t disp = (uint32_t)(((src[idx] & 0xF) << 8) + src[idx + 1] + 1);
				if(disp > pos){
					/* Reference before start of output: invalid stream. */
					return -1;
				}
				pos += (uint32_t)n;
				idx += 2;
			}else{
				/* Literal byte. */
				pos++;
				idx++;
			}
			flags <<= 1;
		}
	}
	/* Streams are padded to a 4-byte boundary. */
	if(idx % 4 != 0){
		idx += 4 - idx % 4;
	}
	return idx;
}
+
/*
 * Return the decompressed size recorded in an 0x10-type LZ77 header:
 * the 24-bit little-endian value in bytes 1-3.
 *
 * Fixed: the original performed `*(uint32_t*)(source + 1)` — an unaligned,
 * strict-aliasing-violating 32-bit load that also folds the first flag byte
 * (offset 4) into bits 24-31 of a size that is only 24 bits wide.
 */
int lz77_decompressed_size(uint8_t *source){
	return source[1] | (source[2] << 8) | (source[3] << 16);
}
+
+/*
+ * NOTE(review): EXTRACTION CORRUPTION.  Everything from here to the end of
+ * lz77.c was damaged when this patch was extracted: text between '<' and '>'
+ * characters was stripped, fusing loop headers with later statements and
+ * destroying the boundaries of THREE units — lz77_decompress(), an unnamed
+ * longest-match search helper (writes {length, distance} into dest[0..1]),
+ * and lz77_compress() — plus the start of lz77.h.  The code below is left
+ * byte-identical; reconstructing the compressor would be speculative.
+ * Recover the real text from upstream gzinject before building.
+ */
+int lz77_decompress(uint8_t *src, uint8_t *dest){
+ if(*src++ != 0x10){
+ return -1;
+ }
+ int index1 = 0;
+ int num1 = src[0] + (src[1] << 8) + (src[2]<<16);
+ src+=3;
+ /* NOTE(review): loop header lost here — likely "while(index1 < num1){ ... }"
+  * with the flag-byte/token handling stripped; "src[0]>>4" survives fused: */
+ while(index1>4);
+ int num4 = 1 + ((src[0] & 0xF) << 8) + src[1];
+ src+=2;
+ if(num4>num1){
+ return -1;
+ }
+ /* NOTE(review): the copy loop below is fused with the START of the search
+  * helper function (its signature and "if(pos >= len)" guard were eaten): */
+ for(int index3 = 0;index3=len){
+ dest[0] = -1;
+ dest[1] = 0;
+ return;
+ }
+ if(pos<2 || len-pos<2){
+ dest[0] = 0;
+ dest[1] = 0;
+ return;
+ }
+
+ int didx = 0;
+ /* NOTE(review): candidate-scan loop header lost ("index<0x1000 && index..."): */
+ for(int index = 1; index<0x1000 && index=0;--index){
+ if(source[pos+num]!=source[pos-d[index] + num % d[index]]){
+ if(didx>1){
+ memmove((void*)d + (sizeof(int) * index),(void*)d + (sizeof(int) * (index+1)),sizeof(int) * (didx - index - 1));
+ didx--;
+ }else{
+ flag = 0;
+ }
+ }
+ }
+ }
+ dest[0] = num;
+ dest[1] = d[0];
+}
+
+int lz77_compress(uint8_t *src, uint8_t **dest, uint32_t len, uint32_t *lenp){
+ int pos = 0;
+ int cpos = 0;
+ uint8_t *comp = calloc(len,1);
+ comp[cpos++] = 0x10;
+ uint8_t *cp = (uint8_t*)lenp;
+ for(int index=0;index<3;++index){
+ comp[cpos++]=*(uint8_t*)cp++;
+ }
+ int d[2];
+ int dbuf[0x4000];
+ /* NOTE(review): the main compression loop header, the per-group setup
+  * (flag byte, staging buffer) and the search-helper call were lost here: */
+ while(pos 2){
+ uint8_t num2 = ((((d[0] - 3) & 0xF) << 4) + ((d[1] - 1) >> 8 & 0xF));
+ comp2[bpos++] = num2;
+ uint8_t num3 = (d[1] - 1) & 0xFF;
+ comp2[bpos++] = num3;
+ pos+=d[0];
+ num1 |= 1 << (8 - (index+1));
+ }else if(d[0]>=0){
+ comp2[bpos++] = src[pos++];
+ }else{
+ break;
+ }
+ }
+ comp[cpos++] = num1;
+ /* NOTE(review): the tail of lz77_compress and the lz77.h diff header were
+  * fused into the line below; the #include targets were also stripped. */
+ for(int i=0;i
+#include
+#include
+
+int lz77_compressed_length(uint8_t *source);
+int lz77_decompress(uint8_t *source, uint8_t *dest);
+int lz77_decompressed_size(uint8_t *source);
+int lz77_compress(uint8_t *src, uint8_t **dest, uint32_t len, uint32_t *intp);
+
+#endif
\ No newline at end of file
diff --git a/tools/gzinject/src/md5.c b/tools/gzinject/src/md5.c
new file mode 100644
index 000000000..e0affaaf0
--- /dev/null
+++ b/tools/gzinject/src/md5.c
@@ -0,0 +1,291 @@
+/*
+* This is an OpenSSL-compatible implementation of the RSA Data Security, Inc.
+* MD5 Message-Digest Algorithm (RFC 1321).
+*
+* Homepage:
+* http://openwall.info/wiki/people/solar/software/public-domain-source-code/md5
+*
+* Author:
+* Alexander Peslyak, better known as Solar Designer
+*
+* This software was written by Alexander Peslyak in 2001. No copyright is
+* claimed, and the software is hereby placed in the public domain.
+* In case this attempt to disclaim copyright and place the software in the
+* public domain is deemed null and void, then the software is
+* Copyright (c) 2001 Alexander Peslyak and it is hereby released to the
+* general public under the following terms:
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted.
+*
+* There's ABSOLUTELY NO WARRANTY, express or implied.
+*
+* (This is a heavily cut-down "BSD license".)
+*
+* This differs from Colin Plumb's older public domain implementation in that
+* no exactly 32-bit integer data type is required (any 32-bit or wider
+* unsigned integer data type will do), there's no compile-time endianness
+* configuration, and the function prototypes match OpenSSL's. No code from
+* Colin Plumb's implementation has been reused; this comment merely compares
+* the properties of the two independent implementations.
+*
+* The primary goals of this implementation are portability and ease of use.
+* It is meant to be fast, but not as fast as possible. Some known
+* optimizations are not included to reduce source code size and avoid
+* compile-time configuration.
+*/
+
+#ifndef HAVE_OPENSSL
+
+#include
+
+#include "md5.h"
+
+/*
+* The basic MD5 functions.
+*
+* F and G are optimized compared to their RFC 1321 definitions for
+* architectures that lack an AND-NOT instruction, just like in Colin Plumb's
+* implementation.
+*/
+#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
+#define G(x, y, z) ((y) ^ ((z) & ((x) ^ (y))))
+#define H(x, y, z) (((x) ^ (y)) ^ (z))
+#define H2(x, y, z) ((x) ^ ((y) ^ (z)))
+#define I(x, y, z) ((y) ^ ((x) | ~(z)))
+
+/*
+* The MD5 transformation for all four rounds.
+*/
+#define STEP(f, a, b, c, d, x, t, s) \
+ (a) += f((b), (c), (d)) + (x) + (t); \
+ (a) = (((a) << (s)) | (((a) & 0xffffffff) >> (32 - (s)))); \
+ (a) += (b);
+
+/*
+* SET reads 4 input bytes in little-endian byte order and stores them in a
+* properly aligned word in host byte order.
+*
+* The check for little-endian architectures that tolerate unaligned memory
+* accesses is just an optimization. Nothing will break if it fails to detect
+* a suitable architecture.
+*
+* Unfortunately, this optimization may be a C strict aliasing rules violation
+* if the caller's data buffer has effective type that cannot be aliased by
+* MD5_u32plus. In practice, this problem may occur if these MD5 routines are
+* inlined into a calling function, or with future and dangerously advanced
+* link-time optimizations. For the time being, keeping these MD5 routines in
+* their own translation unit avoids the problem.
+*/
+#if defined(__i386__) || defined(__x86_64__) || defined(__vax__)
+#define SET(n) \
+ (*(MD5_u32plus *)&ptr[(n) * 4])
+#define GET(n) \
+ SET(n)
+#else
+#define SET(n) \
+ (ctx->block[(n)] = \
+ (MD5_u32plus)ptr[(n) * 4] | \
+ ((MD5_u32plus)ptr[(n) * 4 + 1] << 8) | \
+ ((MD5_u32plus)ptr[(n) * 4 + 2] << 16) | \
+ ((MD5_u32plus)ptr[(n) * 4 + 3] << 24))
+#define GET(n) \
+ (ctx->block[(n)])
+#endif
+
+/*
+* This processes one or more 64-byte data blocks, but does NOT update the bit
+* counters. There are no alignment requirements.
+*/
+/*
+ * MD5 core: compress one or more complete 64-byte blocks (size must be a
+ * multiple of 64) into the a/b/c/d chaining state.  Does NOT update the
+ * bit counters (MD5_Update does that); returns a pointer just past the
+ * consumed input.  Verbatim Solar Designer public-domain reference code.
+ */
+static const void *body(MD5_CTX *ctx, const void *data, unsigned long size)
+{
+ const unsigned char *ptr;
+ MD5_u32plus a, b, c, d;
+ MD5_u32plus saved_a, saved_b, saved_c, saved_d;
+
+ ptr = (const unsigned char *)data;
+
+ a = ctx->a;
+ b = ctx->b;
+ c = ctx->c;
+ d = ctx->d;
+
+ /* One iteration per 64-byte block. */
+ do {
+ saved_a = a;
+ saved_b = b;
+ saved_c = c;
+ saved_d = d;
+
+ /* Round 1 */
+ STEP(F, a, b, c, d, SET(0), 0xd76aa478, 7)
+ STEP(F, d, a, b, c, SET(1), 0xe8c7b756, 12)
+ STEP(F, c, d, a, b, SET(2), 0x242070db, 17)
+ STEP(F, b, c, d, a, SET(3), 0xc1bdceee, 22)
+ STEP(F, a, b, c, d, SET(4), 0xf57c0faf, 7)
+ STEP(F, d, a, b, c, SET(5), 0x4787c62a, 12)
+ STEP(F, c, d, a, b, SET(6), 0xa8304613, 17)
+ STEP(F, b, c, d, a, SET(7), 0xfd469501, 22)
+ STEP(F, a, b, c, d, SET(8), 0x698098d8, 7)
+ STEP(F, d, a, b, c, SET(9), 0x8b44f7af, 12)
+ STEP(F, c, d, a, b, SET(10), 0xffff5bb1, 17)
+ STEP(F, b, c, d, a, SET(11), 0x895cd7be, 22)
+ STEP(F, a, b, c, d, SET(12), 0x6b901122, 7)
+ STEP(F, d, a, b, c, SET(13), 0xfd987193, 12)
+ STEP(F, c, d, a, b, SET(14), 0xa679438e, 17)
+ STEP(F, b, c, d, a, SET(15), 0x49b40821, 22)
+
+ /* Round 2 */
+ STEP(G, a, b, c, d, GET(1), 0xf61e2562, 5)
+ STEP(G, d, a, b, c, GET(6), 0xc040b340, 9)
+ STEP(G, c, d, a, b, GET(11), 0x265e5a51, 14)
+ STEP(G, b, c, d, a, GET(0), 0xe9b6c7aa, 20)
+ STEP(G, a, b, c, d, GET(5), 0xd62f105d, 5)
+ STEP(G, d, a, b, c, GET(10), 0x02441453, 9)
+ STEP(G, c, d, a, b, GET(15), 0xd8a1e681, 14)
+ STEP(G, b, c, d, a, GET(4), 0xe7d3fbc8, 20)
+ STEP(G, a, b, c, d, GET(9), 0x21e1cde6, 5)
+ STEP(G, d, a, b, c, GET(14), 0xc33707d6, 9)
+ STEP(G, c, d, a, b, GET(3), 0xf4d50d87, 14)
+ STEP(G, b, c, d, a, GET(8), 0x455a14ed, 20)
+ STEP(G, a, b, c, d, GET(13), 0xa9e3e905, 5)
+ STEP(G, d, a, b, c, GET(2), 0xfcefa3f8, 9)
+ STEP(G, c, d, a, b, GET(7), 0x676f02d9, 14)
+ STEP(G, b, c, d, a, GET(12), 0x8d2a4c8a, 20)
+
+ /* Round 3 */
+ STEP(H, a, b, c, d, GET(5), 0xfffa3942, 4)
+ STEP(H2, d, a, b, c, GET(8), 0x8771f681, 11)
+ STEP(H, c, d, a, b, GET(11), 0x6d9d6122, 16)
+ STEP(H2, b, c, d, a, GET(14), 0xfde5380c, 23)
+ STEP(H, a, b, c, d, GET(1), 0xa4beea44, 4)
+ STEP(H2, d, a, b, c, GET(4), 0x4bdecfa9, 11)
+ STEP(H, c, d, a, b, GET(7), 0xf6bb4b60, 16)
+ STEP(H2, b, c, d, a, GET(10), 0xbebfbc70, 23)
+ STEP(H, a, b, c, d, GET(13), 0x289b7ec6, 4)
+ STEP(H2, d, a, b, c, GET(0), 0xeaa127fa, 11)
+ STEP(H, c, d, a, b, GET(3), 0xd4ef3085, 16)
+ STEP(H2, b, c, d, a, GET(6), 0x04881d05, 23)
+ STEP(H, a, b, c, d, GET(9), 0xd9d4d039, 4)
+ STEP(H2, d, a, b, c, GET(12), 0xe6db99e5, 11)
+ STEP(H, c, d, a, b, GET(15), 0x1fa27cf8, 16)
+ STEP(H2, b, c, d, a, GET(2), 0xc4ac5665, 23)
+
+ /* Round 4 */
+ STEP(I, a, b, c, d, GET(0), 0xf4292244, 6)
+ STEP(I, d, a, b, c, GET(7), 0x432aff97, 10)
+ STEP(I, c, d, a, b, GET(14), 0xab9423a7, 15)
+ STEP(I, b, c, d, a, GET(5), 0xfc93a039, 21)
+ STEP(I, a, b, c, d, GET(12), 0x655b59c3, 6)
+ STEP(I, d, a, b, c, GET(3), 0x8f0ccc92, 10)
+ STEP(I, c, d, a, b, GET(10), 0xffeff47d, 15)
+ STEP(I, b, c, d, a, GET(1), 0x85845dd1, 21)
+ STEP(I, a, b, c, d, GET(8), 0x6fa87e4f, 6)
+ STEP(I, d, a, b, c, GET(15), 0xfe2ce6e0, 10)
+ STEP(I, c, d, a, b, GET(6), 0xa3014314, 15)
+ STEP(I, b, c, d, a, GET(13), 0x4e0811a1, 21)
+ STEP(I, a, b, c, d, GET(4), 0xf7537e82, 6)
+ STEP(I, d, a, b, c, GET(11), 0xbd3af235, 10)
+ STEP(I, c, d, a, b, GET(2), 0x2ad7d2bb, 15)
+ STEP(I, b, c, d, a, GET(9), 0xeb86d391, 21)
+
+ a += saved_a;
+ b += saved_b;
+ c += saved_c;
+ d += saved_d;
+
+ ptr += 64;
+ } while (size -= 64);
+
+ ctx->a = a;
+ ctx->b = b;
+ ctx->c = c;
+ ctx->d = d;
+
+ return ptr;
+}
+
+void MD5_Init(MD5_CTX *ctx)
+{
+ ctx->a = 0x67452301;
+ ctx->b = 0xefcdab89;
+ ctx->c = 0x98badcfe;
+ ctx->d = 0x10325476;
+
+ ctx->lo = 0;
+ ctx->hi = 0;
+}
+
+/*
+ * Absorb `size` bytes of `data` into the hash.  ctx->lo holds the message
+ * length in bytes modulo 2^29 (shifted to a bit count in MD5_Final);
+ * ctx->hi collects the overflow.  A partial trailing block is buffered in
+ * ctx->buffer between calls.  Verbatim reference implementation.
+ */
+void MD5_Update(MD5_CTX *ctx, const void *data, unsigned long size)
+{
+ MD5_u32plus saved_lo;
+ unsigned long used, available;
+
+ saved_lo = ctx->lo;
+ /* Advance the 29-bit byte counter; carry into hi on wraparound. */
+ if ((ctx->lo = (saved_lo + size) & 0x1fffffff) < saved_lo)
+ ctx->hi++;
+ ctx->hi += size >> 29;
+
+ /* Bytes already sitting in the partial-block buffer. */
+ used = saved_lo & 0x3f;
+
+ if (used) {
+ available = 64 - used;
+
+ if (size < available) {
+ memcpy(&ctx->buffer[used], data, size);
+ return;
+ }
+
+ /* Complete the buffered block and compress it. */
+ memcpy(&ctx->buffer[used], data, available);
+ data = (const unsigned char *)data + available;
+ size -= available;
+ body(ctx, ctx->buffer, 64);
+ }
+
+ /* Compress whole blocks straight from the caller's buffer. */
+ if (size >= 64) {
+ data = body(ctx, data, size & ~(unsigned long)0x3f);
+ size &= 0x3f;
+ }
+
+ /* Stash the remaining partial block (possibly 0 bytes). */
+ memcpy(ctx->buffer, data, size);
+}
+
+#define OUT(dst, src) \
+ (dst)[0] = (unsigned char)(src); \
+ (dst)[1] = (unsigned char)((src) >> 8); \
+ (dst)[2] = (unsigned char)((src) >> 16); \
+ (dst)[3] = (unsigned char)((src) >> 24);
+
+/*
+ * Finish the hash: append the 0x80 pad byte, zero-fill to 56 bytes mod 64,
+ * append the 64-bit little-endian bit length, compress the last block(s),
+ * emit the 16-byte digest little-endian word by word, and wipe the context.
+ */
+void MD5_Final(unsigned char *result, MD5_CTX *ctx)
+{
+ unsigned long used, available;
+
+ used = ctx->lo & 0x3f;
+
+ ctx->buffer[used++] = 0x80;
+
+ available = 64 - used;
+
+ /* Not enough room for the 8-byte length: pad out this block and start
+  * a fresh one. */
+ if (available < 8) {
+ memset(&ctx->buffer[used], 0, available);
+ body(ctx, ctx->buffer, 64);
+ used = 0;
+ available = 64;
+ }
+
+ memset(&ctx->buffer[used], 0, available - 8);
+
+ /* Convert the byte count to a bit count before serializing it. */
+ ctx->lo <<= 3;
+ OUT(&ctx->buffer[56], ctx->lo)
+ OUT(&ctx->buffer[60], ctx->hi)
+
+ body(ctx, ctx->buffer, 64);
+
+ OUT(&result[0], ctx->a)
+ OUT(&result[4], ctx->b)
+ OUT(&result[8], ctx->c)
+ OUT(&result[12], ctx->d)
+
+ /* Don't leave key material / state behind. */
+ memset(ctx, 0, sizeof(*ctx));
+}
+
+#endif
\ No newline at end of file
diff --git a/tools/gzinject/src/md5.h b/tools/gzinject/src/md5.h
new file mode 100644
index 000000000..f51d33e6d
--- /dev/null
+++ b/tools/gzinject/src/md5.h
@@ -0,0 +1,45 @@
+/*
+* This is an OpenSSL-compatible implementation of the RSA Data Security, Inc.
+* MD5 Message-Digest Algorithm (RFC 1321).
+*
+* Homepage:
+* http://openwall.info/wiki/people/solar/software/public-domain-source-code/md5
+*
+* Author:
+* Alexander Peslyak, better known as Solar Designer
+*
+* This software was written by Alexander Peslyak in 2001. No copyright is
+* claimed, and the software is hereby placed in the public domain.
+* In case this attempt to disclaim copyright and place the software in the
+* public domain is deemed null and void, then the software is
+* Copyright (c) 2001 Alexander Peslyak and it is hereby released to the
+* general public under the following terms:
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted.
+*
+* There's ABSOLUTELY NO WARRANTY, express or implied.
+*
+* See md5.c for more information.
+*/
+
+#ifdef HAVE_OPENSSL
+#include
+#elif !defined(_MD5_H)
+#define _MD5_H
+
+/* Any 32-bit or wider unsigned integer data type will do */
+typedef unsigned int MD5_u32plus;
+
+typedef struct {
+ MD5_u32plus lo, hi;
+ MD5_u32plus a, b, c, d;
+ unsigned char buffer[64];
+ MD5_u32plus block[16];
+} MD5_CTX;
+
+extern void MD5_Init(MD5_CTX *ctx);
+extern void MD5_Update(MD5_CTX *ctx, const void *data, unsigned long size);
+extern void MD5_Final(unsigned char *result, MD5_CTX *ctx);
+
+#endif
\ No newline at end of file
diff --git a/tools/gzinject/src/romchu.c b/tools/gzinject/src/romchu.c
new file mode 100644
index 000000000..aa37bb9cc
--- /dev/null
+++ b/tools/gzinject/src/romchu.c
@@ -0,0 +1,543 @@
+#include
+#include
+#include
+#include
+#include
+
+/* romchu 0.6 */
+/* a decompressor for type 2 romc */
+/* reversed by hcs from the Wii VC wad for Super Smash Bros EU. */
+/* this code is public domain, have at it */
+/* Taken from https://forum.xentax.com/viewtopic.php?t=5364 */
+
+#define VERSION "0.6"
+
+struct bitstream;
+
+struct bitstream *init_bitstream(const unsigned char *pool, unsigned long pool_size);
+uint32_t get_bits(struct bitstream *bs, int bits);
+int bitstream_eof(struct bitstream *bs);
+void free_bitstream(struct bitstream *bs);
+
+struct huftable;
+
+struct huftable *load_table(struct bitstream *bs, int symbols);
+int huf_lookup(struct bitstream *bs, struct huftable *ht);
+void free_table(struct huftable *);
+
+/* Backreference decode tables, one entry per Huffman symbol: `bits` extra
+ * bits are read from the stream and added to `base` to form a match length
+ * (backref_len) or displacement (backref_disp).  Filled in by
+ * romchu_decompress before the block loop. */
+struct {
+ unsigned int bits;
+ unsigned int base;
+} backref_len[0x1D], backref_disp[0x1E];
+
+/*
+ * Decompress a type-0/type-2 "romc" blob (Wii VC N64 rom container).
+ * Header: 34-bit big-endian nominal size packed into 4 bytes, low 2 bits =
+ * romc type.  Type 0 is stored; type 2 is Huffman-coded LZ with per-64KB
+ * payload blocks.  Returns a malloc'd buffer (caller frees) and stores the
+ * nominal size via decomp_size; returns NULL on error.
+ *
+ * NOTE(review): this function was damaged in extraction around the
+ * backreference-table setup / block-loop header (see inline note below);
+ * left byte-identical — recover the missing text from upstream romchu 0.6.
+ */
+uint8_t *romchu_decompress(uint8_t *compressed, size_t comp_size, size_t *decomp_size){
+
+ unsigned char head_buf[4];
+ unsigned char payload_buf[0x10000];
+ int block_count = 0;
+ long out_offset = 0;
+ uint8_t *decompressed;
+
+ uint64_t nominal_size;
+ int romc_type;
+ uint8_t *comp = compressed;
+ // read header
+ {
+ memcpy(head_buf,compressed,4);
+ /* 34-bit size: 8+8+8 bits then the top 6 bits of the last byte. */
+ nominal_size = head_buf[0];
+ nominal_size *= 0x100;
+ nominal_size |= head_buf[1];
+ nominal_size *= 0x100;
+ nominal_size |= head_buf[2];
+ nominal_size *= 0x40;
+ nominal_size |= head_buf[3]>>2;
+ romc_type = head_buf[3]&0x3;
+ decompressed = malloc(nominal_size);
+ if(decomp_size) *decomp_size = nominal_size;
+ if (!decompressed)
+ {
+ perror("malloc big outbuf buffer");
+ return NULL;
+ }
+
+ switch(romc_type) {
+ case 0:
+ /* Stored: payload follows the header verbatim.
+  * NOTE(review): dereferences decomp_size without the NULL guard used
+  * above — crashes if the caller passed NULL; TODO use nominal_size. */
+ memcpy(decompressed, compressed + 4, *decomp_size);
+ return decompressed;
+ case 2:
+ break;
+ default:
+ fprintf(stderr, "Unsupported romc type. %d\n", romc_type);
+ return NULL;
+
+ }
+ }
+
+ // initialize backreference lookup tables
+ {
+ for (unsigned int i = 0; i < 8; i++)
+ {
+ backref_len[i].bits = 0;
+ backref_len[i].base = i;
+ }
+
+ for (unsigned int i = 8, scale = 1; scale < 6; scale++)
+ {
+ for (unsigned int k = (1<<(scale+2));
+ k < (1<<(scale+3));
+ /* NOTE(review): EXTRACTION CORRUPTION — the rest of the table
+  * initialization, the per-block read loop header, and the
+  * declarations/reads of compression_flag, payload_bytes,
+  * payload_bits and read_size were lost; the next line is fused
+  * from two distant statements. */
+ k += (1< 0)
+ {
+ read_size ++;
+ }
+
+ if (read_size > sizeof(payload_buf))
+ {
+ fprintf(stderr, "payload too large\n");
+ free(decompressed);
+ return NULL;
+ }
+ memcpy(payload_buf,compressed,read_size);
+ compressed+=read_size;
+
+ /* attempt to parse... */
+
+ if (compression_flag)
+ {
+ uint16_t tab1_size, tab2_size;
+ uint32_t body_size;
+ unsigned long tab1_offset, tab2_offset, body_offset;
+ struct bitstream *bs;
+ struct huftable *table1, *table2;
+
+ /* read table 1 size */
+ tab1_offset = 0;
+ bs = init_bitstream(payload_buf + tab1_offset, payload_bytes*8+payload_bits);
+ tab1_size = get_bits(bs, 16);
+ free_bitstream(bs);
+
+ /* load table 1 (0x11D symbols: 256 literals + length codes) */
+ bs = init_bitstream(payload_buf + tab1_offset + 2, tab1_size);
+ table1 = load_table(bs, 0x11D);
+ free_bitstream(bs);
+
+ /* read table 2 size */
+ tab2_offset = tab1_offset + 2 + (tab1_size+7) / 8;
+ bs = init_bitstream(payload_buf + tab2_offset, 2*8);
+ tab2_size = get_bits(bs, 16);
+ free_bitstream(bs);
+
+ /* load table 2 (0x1E displacement symbols) */
+ bs = init_bitstream(payload_buf + tab2_offset + 2, tab2_size);
+ table2 = load_table(bs, 0x1E);
+ free_bitstream(bs);
+
+ /* decode body */
+ body_offset = tab2_offset + 2 + (tab2_size+7) / 8;
+ body_size = payload_bytes*8 + payload_bits - body_offset*8;
+ bs = init_bitstream(payload_buf + body_offset, body_size);
+
+ while (!bitstream_eof(bs))
+ {
+ int symbol = huf_lookup(bs, table1);
+
+ if (symbol < 0x100)
+ {
+ /* byte literal */
+ unsigned char b = symbol;
+ if (out_offset >= nominal_size)
+ {
+ fprintf(stderr, "generated too many bytes\n");
+ free(decompressed);
+ return NULL;
+ }
+ decompressed[out_offset++] = b;
+ }
+ else
+ {
+ /* backreference */
+ unsigned int len_bits = backref_len[symbol-0x100].bits;
+ unsigned int len = backref_len[symbol-0x100].base;
+ if (len_bits > 0)
+ {
+ len += get_bits(bs, len_bits);
+ }
+ len += 3;
+
+ int symbol2 = huf_lookup(bs, table2);
+
+ unsigned int disp_bits = backref_disp[symbol2].bits;
+ unsigned int disp = backref_disp[symbol2].base;
+ if (disp_bits > 0)
+ {
+ disp += get_bits(bs, disp_bits);
+ }
+ disp ++;
+
+ if (disp > out_offset)
+ {
+ fprintf(stderr, "backreference too far\n");
+ free(decompressed);
+ return NULL;
+ }
+ if (out_offset+len > nominal_size)
+ {
+ fprintf(stderr, "generated too many bytes\n");
+ free(decompressed);
+ return NULL;
+ }
+ /* byte-by-byte so overlapping references self-extend */
+ for (unsigned int i = 0; i < len; i++, out_offset++)
+ {
+ decompressed[out_offset] = decompressed[out_offset-disp];
+ }
+ }
+ }
+
+ free_table(table1);
+ free_table(table2);
+ free_bitstream(bs);
+ }
+ else
+ {
+ /* uncompressed block: copy the payload straight through */
+ if (out_offset + payload_bytes > nominal_size)
+ {
+ fprintf(stderr, "generated too many bytes\n");
+ free(decompressed);
+ return NULL;
+ }
+ memcpy(decompressed+out_offset, payload_buf, payload_bytes);
+ out_offset += payload_bytes;
+ }
+
+ block_count ++;
+ }
+ return decompressed;
+}
+
+/* bitstream reader */
+struct bitstream
+{
+ /* next unread byte of the caller-owned bit pool */
+ const unsigned char *pool;
+ /* bits remaining in pool (excluding first_byte) */
+ long bits_left;
+ /* staging byte that bits are shifted out of, LSB first */
+ uint8_t first_byte;
+ /* valid bits still left in first_byte */
+ int first_byte_bits;
+};
+
+struct bitstream *init_bitstream(const unsigned char *pool, unsigned long pool_size)
+{
+ struct bitstream *bs = malloc(sizeof(struct bitstream));
+ if (!bs)
+ {
+ perror("bitstream malloc");
+ exit(EXIT_FAILURE);
+ }
+
+ bs->pool = pool;
+ bs->bits_left = pool_size;
+ bs->first_byte_bits = 0;
+
+ /* check that padding bits are 0 (to ensure we aren't ignoring anything) */
+ if (pool_size%8)
+ {
+ if (pool[pool_size/8] & ~((1<<(pool_size%8))-1))
+ {
+ fprintf(stderr, "nonzero padding at end of bitstream\n");
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ return bs;
+}
+
+/*
+ * Read `bits` (1..32) from the stream.  Bits are consumed LSB-first within
+ * each byte, and the first bit read ends up as bit 0 of the result (the
+ * accumulator is filled from bit 31 down, then shifted right).  Aborts on
+ * a request wider than 32 bits or past the end of the stream.
+ */
+uint32_t get_bits(struct bitstream *bs, int bits)
+{
+ uint32_t accum = 0;
+
+ if (bits > 32)
+ {
+ fprintf(stderr, "get_bits() supports max 32\n");
+ exit(EXIT_FAILURE);
+ }
+ if (bits > bs->bits_left + bs->first_byte_bits)
+ {
+ fprintf(stderr, "get_bits() underflow\n");
+ exit(EXIT_FAILURE);
+ }
+
+ for (int i = 0; i < bits; i++)
+ {
+ /* Staging byte exhausted: pull the next byte from the pool. */
+ if (bs->first_byte_bits == 0)
+ {
+ bs->first_byte = *bs->pool;
+ bs->pool ++;
+ if (bs->bits_left >= 8)
+ {
+ bs->first_byte_bits = 8;
+ bs->bits_left -= 8;
+ }
+ else
+ {
+ /* Final partial byte of the stream. */
+ bs->first_byte_bits = bs->bits_left;
+ bs->bits_left = 0;
+ }
+ }
+
+ accum >>= 1;
+ accum |= (bs->first_byte & 1)<<31;
+ bs->first_byte >>= 1;
+ bs->first_byte_bits --;
+ }
+
+ /* Align the collected bits down to bit 0. */
+ return accum>>(32-bits);
+}
+
+int bitstream_eof(struct bitstream *bs)
+{
+ return (bs->bits_left + bs->first_byte_bits == 0);
+}
+
+/* Release a reader from init_bitstream; the underlying pool is not owned
+ * by the reader and is left untouched. */
+void free_bitstream(struct bitstream *bs)
+{
+ free(bs);
+}
+
+/* Huffman code handling */
+struct hufnode {
+ int is_leaf;
+ union {
+ struct {
+ /* child indices into huftable.t; 0 means "no child yet" (node 0 is
+  * the root, so it can never legitimately appear as a child) */
+ int left, right;
+ } inner;
+ struct {
+ int symbol;
+ } leaf;
+ } u;
+};
+struct huftable {
+ /* number of symbols the code covers */
+ int symbols;
+ /* node pool (up to 2*symbols entries); t[0] is the root */
+ struct hufnode *t;
+};
+
+struct huftable *load_table(struct bitstream *bs, int symbols)
+{
+ int len_count[32] = {0};
+ uint32_t codes[32];
+ int *length_of = malloc(sizeof(*length_of) * symbols);
+ struct huftable *ht;
+ int next_free_node;
+
+ for (int i = 0; i < symbols; )
+ {
+ if (get_bits(bs, 1))
+ {
+ /* run of equal lengths */
+ int count = get_bits(bs, 7) + 2;
+ int length = get_bits(bs, 5);
+
+ len_count[length] += count;
+ for (int j = 0; j < count; j++, i++)
+ {
+ length_of[i] = length;
+ }
+ }
+ else
+ {
+ /* set of inequal lengths */
+ int count = get_bits(bs, 7) + 1;
+
+ for (int j = 0; j < count; j++, i++)
+ {
+ int length = get_bits(bs, 5);
+ length_of[i] = length;
+ len_count[length] ++;
+ }
+ }
+ }
+
+ if (!bitstream_eof(bs))
+ {
+ fprintf(stderr, "did not exhaust bitstream reading table\n");
+ exit(EXIT_FAILURE);
+ }
+
+ /* compute the first canonical Huffman code for each length */
+ len_count[0] = 0; // not strictly necessary
+ for (uint32_t i = 1, accum = 0; i < 32; i++)
+ {
+ accum = codes[i] = (accum + len_count[i-1]) << 1;
+ }
+
+ /* allocate space for the tree */
+ ht = malloc(sizeof(struct huftable));
+ if (!ht)
+ {
+ perror("malloc of huftable");
+ exit(EXIT_FAILURE);
+ }
+ ht->symbols = symbols;
+ ht->t = malloc(sizeof(struct hufnode) * symbols * 2);
+ if (!ht->t)
+ {
+ perror("malloc of hufnodes");
+ exit(EXIT_FAILURE);
+ }
+
+ /* determine codes and build a tree */
+ for (int i = 0; i < symbols*2; i++)
+ {
+ ht->t[i].is_leaf = 0;
+ ht->t[i].u.inner.left = ht->t[i].u.inner.right = 0;
+ }
+ next_free_node = 1;
+ for (int i = 0; i < symbols; i++)
+ {
+ int cur = 0;
+ if (0 == length_of[i])
+ {
+ // 0 length indicates absent symbol
+ continue;
+ }
+
+ for (int j = length_of[i]-1; j >= 0; j --)
+ {
+ int next;
+ if (ht->t[cur].is_leaf)
+ {
+ fprintf(stderr, "oops, walked onto a leaf\n");
+ exit(EXIT_FAILURE);
+ }
+
+ if (codes[length_of[i]]&(1<t[cur].u.inner.right;
+ if (0 == next)
+ {
+ next = ht->t[cur].u.inner.right = next_free_node ++;
+ }
+ }
+ else
+ {
+ // 0 == left
+ next = ht->t[cur].u.inner.left ;
+ if (0 == next)
+ {
+ next = ht->t[cur].u.inner.left = next_free_node ++;
+ }
+ }
+
+ cur = next;
+ }
+
+ ht->t[cur].is_leaf = 1;
+ ht->t[cur].u.leaf.symbol = i;
+
+ codes[length_of[i]] ++;
+ }
+ free(length_of);
+ return ht;
+}
+
+int huf_lookup(struct bitstream *bs, struct huftable *ht)
+{
+ int cur = 0;
+ while (!ht->t[cur].is_leaf)
+ {
+ if (get_bits(bs, 1))
+ {
+ // 1 == right
+ cur = ht->t[cur].u.inner.right;
+ }
+ else
+ {
+ // 0 == left
+ cur = ht->t[cur].u.inner.left;
+ }
+ }
+
+ return ht->t[cur].u.leaf.symbol;
+}
+
+void free_table(struct huftable *ht)
+{
+ if (ht)
+ {
+ free(ht->t);
+ }
+ free(ht);
+}
diff --git a/tools/gzinject/src/romchu.h b/tools/gzinject/src/romchu.h
new file mode 100644
index 000000000..1c2f8b838
--- /dev/null
+++ b/tools/gzinject/src/romchu.h
@@ -0,0 +1,8 @@
+#ifndef _ROMCHU_H
+#define _ROMCHU_H
+
+#include
+
+uint8_t *romchu_decompress(uint8_t *compressed, size_t comp_size, size_t *decomp_size);
+
+#endif
\ No newline at end of file
diff --git a/tools/gzinject/src/sha1.c b/tools/gzinject/src/sha1.c
new file mode 100644
index 000000000..73794062a
--- /dev/null
+++ b/tools/gzinject/src/sha1.c
@@ -0,0 +1,296 @@
+/*
+SHA-1 in C
+By Steve Reid
+100% Public Domain
+
+Test Vectors (from FIPS PUB 180-1)
+"abc"
+A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D
+"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"
+84983E44 1C3BD26E BAAE4AA1 F95129E5 E54670F1
+A million repetitions of "a"
+34AA973C D4C4DAA4 F61EEB2B DBAD2731 6534016F
+*/
+
+/* #define LITTLE_ENDIAN * This should be #define'd already, if true. */
+/* #define SHA1HANDSOFF * Copies data before messing with it. */
+
+#define SHA1HANDSOFF
+
+#include
+#include
+
+/* for uint32_t */
+#include
+
+#include "sha1.h"
+
+
+#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
+
+/* blk0() and blk() perform the initial expand. */
+/* I got the idea of expanding during the round function from SSLeay */
+#if BYTE_ORDER == LITTLE_ENDIAN
+#define blk0(i) (block->l[i] = (rol(block->l[i],24)&0xFF00FF00) \
+ |(rol(block->l[i],8)&0x00FF00FF))
+#elif BYTE_ORDER == BIG_ENDIAN
+#define blk0(i) block->l[i]
+#else
+#error "Endianness not defined!"
+#endif
+#define blk(i) (block->l[i&15] = rol(block->l[(i+13)&15]^block->l[(i+8)&15] \
+ ^block->l[(i+2)&15]^block->l[i&15],1))
+
+/* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */
+#define R0(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk0(i)+0x5A827999+rol(v,5);w=rol(w,30);
+#define R1(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk(i)+0x5A827999+rol(v,5);w=rol(w,30);
+#define R2(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0x6ED9EBA1+rol(v,5);w=rol(w,30);
+#define R3(v,w,x,y,z,i) z+=(((w|x)&y)|(w&x))+blk(i)+0x8F1BBCDC+rol(v,5);w=rol(w,30);
+#define R4(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0xCA62C1D6+rol(v,5);w=rol(w,30);
+
+
+/* Hash a single 512-bit block. This is the core of the algorithm. */
+
+/*
+ * Compress exactly one 512-bit block into the five-word chaining state.
+ * Verbatim Steve Reid public-domain reference code; the 80 rounds are
+ * fully unrolled with the working variables rotated by argument order.
+ */
+void SHA1Transform(
+ uint32_t state[5],
+ const unsigned char buffer[64]
+)
+{
+ uint32_t a, b, c, d, e;
+
+ typedef union
+ {
+ unsigned char c[64];
+ uint32_t l[16];
+ } CHAR64LONG16;
+
+#ifdef SHA1HANDSOFF
+ CHAR64LONG16 block[1]; /* use array to appear as a pointer */
+
+ /* Work on a copy so blk0()'s in-place byte swapping never touches the
+  * caller's buffer. */
+ memcpy(block, buffer, 64);
+#else
+ /* The following had better never be used because it causes the
+ * pointer-to-const buffer to be cast into a pointer to non-const.
+ * And the result is written through. I threw a "const" in, hoping
+ * this will cause a diagnostic.
+ */
+ CHAR64LONG16 *block = (const CHAR64LONG16 *)buffer;
+#endif
+ /* Copy context->state[] to working vars */
+ a = state[0];
+ b = state[1];
+ c = state[2];
+ d = state[3];
+ e = state[4];
+ /* 4 rounds of 20 operations each. Loop unrolled. */
+ R0(a, b, c, d, e, 0);
+ R0(e, a, b, c, d, 1);
+ R0(d, e, a, b, c, 2);
+ R0(c, d, e, a, b, 3);
+ R0(b, c, d, e, a, 4);
+ R0(a, b, c, d, e, 5);
+ R0(e, a, b, c, d, 6);
+ R0(d, e, a, b, c, 7);
+ R0(c, d, e, a, b, 8);
+ R0(b, c, d, e, a, 9);
+ R0(a, b, c, d, e, 10);
+ R0(e, a, b, c, d, 11);
+ R0(d, e, a, b, c, 12);
+ R0(c, d, e, a, b, 13);
+ R0(b, c, d, e, a, 14);
+ R0(a, b, c, d, e, 15);
+ R1(e, a, b, c, d, 16);
+ R1(d, e, a, b, c, 17);
+ R1(c, d, e, a, b, 18);
+ R1(b, c, d, e, a, 19);
+ R2(a, b, c, d, e, 20);
+ R2(e, a, b, c, d, 21);
+ R2(d, e, a, b, c, 22);
+ R2(c, d, e, a, b, 23);
+ R2(b, c, d, e, a, 24);
+ R2(a, b, c, d, e, 25);
+ R2(e, a, b, c, d, 26);
+ R2(d, e, a, b, c, 27);
+ R2(c, d, e, a, b, 28);
+ R2(b, c, d, e, a, 29);
+ R2(a, b, c, d, e, 30);
+ R2(e, a, b, c, d, 31);
+ R2(d, e, a, b, c, 32);
+ R2(c, d, e, a, b, 33);
+ R2(b, c, d, e, a, 34);
+ R2(a, b, c, d, e, 35);
+ R2(e, a, b, c, d, 36);
+ R2(d, e, a, b, c, 37);
+ R2(c, d, e, a, b, 38);
+ R2(b, c, d, e, a, 39);
+ R3(a, b, c, d, e, 40);
+ R3(e, a, b, c, d, 41);
+ R3(d, e, a, b, c, 42);
+ R3(c, d, e, a, b, 43);
+ R3(b, c, d, e, a, 44);
+ R3(a, b, c, d, e, 45);
+ R3(e, a, b, c, d, 46);
+ R3(d, e, a, b, c, 47);
+ R3(c, d, e, a, b, 48);
+ R3(b, c, d, e, a, 49);
+ R3(a, b, c, d, e, 50);
+ R3(e, a, b, c, d, 51);
+ R3(d, e, a, b, c, 52);
+ R3(c, d, e, a, b, 53);
+ R3(b, c, d, e, a, 54);
+ R3(a, b, c, d, e, 55);
+ R3(e, a, b, c, d, 56);
+ R3(d, e, a, b, c, 57);
+ R3(c, d, e, a, b, 58);
+ R3(b, c, d, e, a, 59);
+ R4(a, b, c, d, e, 60);
+ R4(e, a, b, c, d, 61);
+ R4(d, e, a, b, c, 62);
+ R4(c, d, e, a, b, 63);
+ R4(b, c, d, e, a, 64);
+ R4(a, b, c, d, e, 65);
+ R4(e, a, b, c, d, 66);
+ R4(d, e, a, b, c, 67);
+ R4(c, d, e, a, b, 68);
+ R4(b, c, d, e, a, 69);
+ R4(a, b, c, d, e, 70);
+ R4(e, a, b, c, d, 71);
+ R4(d, e, a, b, c, 72);
+ R4(c, d, e, a, b, 73);
+ R4(b, c, d, e, a, 74);
+ R4(a, b, c, d, e, 75);
+ R4(e, a, b, c, d, 76);
+ R4(d, e, a, b, c, 77);
+ R4(c, d, e, a, b, 78);
+ R4(b, c, d, e, a, 79);
+ /* Add the working vars back into context.state[] */
+ state[0] += a;
+ state[1] += b;
+ state[2] += c;
+ state[3] += d;
+ state[4] += e;
+ /* Wipe variables */
+ a = b = c = d = e = 0;
+#ifdef SHA1HANDSOFF
+ memset(block, '\0', sizeof(block));
+#endif
+}
+
+
+/* SHA1Init - Initialize new context */
+
+void SHA1Init(
+ SHA1_CTX * context
+)
+{
+ /* SHA1 initialization constants */
+ context->state[0] = 0x67452301;
+ context->state[1] = 0xEFCDAB89;
+ context->state[2] = 0x98BADCFE;
+ context->state[3] = 0x10325476;
+ context->state[4] = 0xC3D2E1F0;
+ context->count[0] = context->count[1] = 0;
+}
+
+
+/* Run your data through this. */
+
+/*
+ * Absorb len bytes into the hash.  count[] holds the total message length
+ * in BITS (count[0] low word, count[1] high word); a partial block is
+ * carried in context->buffer between calls.
+ */
+void SHA1Update(
+ SHA1_CTX * context,
+ const unsigned char *data,
+ uint32_t len
+)
+{
+ uint32_t i;
+
+ uint32_t j;
+
+ j = context->count[0];
+ /* Advance the bit counter; carry into the high word on overflow. */
+ if ((context->count[0] += len << 3) < j)
+ context->count[1]++;
+ context->count[1] += (len >> 29);
+ /* Bytes already buffered from a previous call. */
+ j = (j >> 3) & 63;
+ if ((j + len) > 63)
+ {
+ /* Fill and compress the buffered block, then whole input blocks. */
+ memcpy(&context->buffer[j], data, (i = 64 - j));
+ SHA1Transform(context->state, context->buffer);
+ for (; i + 63 < len; i += 64)
+ {
+ SHA1Transform(context->state, &data[i]);
+ }
+ j = 0;
+ }
+ else
+ i = 0;
+ /* Stash the remainder (possibly 0 bytes) for the next call. */
+ memcpy(&context->buffer[j], &data[i], len - i);
+}
+
+
+/* Add padding and return the message digest. */
+
+/*
+ * Finish the hash: pad with 0x80 (0200 octal) then zeros to 56 bytes mod
+ * 64 (the 504/448 masks below test the bit counter), append the 64-bit
+ * big-endian bit length, and emit the 20-byte big-endian digest.  Wipes
+ * the context and the length scratch afterwards.
+ */
+void SHA1Final(
+ unsigned char digest[20],
+ SHA1_CTX * context
+)
+{
+ unsigned i;
+
+ unsigned char finalcount[8];
+
+ unsigned char c;
+
+/* NOTE(review): the #if 0 block below is disabled upstream code kept
+ * verbatim; it even lacks a semicolon, which is harmless only because it
+ * is compiled out. */
+#if 0 /* untested "improvement" by DHR */
+ /* Convert context->count to a sequence of bytes
+ * in finalcount. Second element first, but
+ * big-endian order within element.
+ * But we do it all backwards.
+ */
+ unsigned char *fcp = &finalcount[8];
+
+ for (i = 0; i < 2; i++)
+ {
+ uint32_t t = context->count[i];
+
+ int j;
+
+ for (j = 0; j < 4; t >>= 8, j++)
+ *--fcp = (unsigned char)t
+ }
+#else
+ for (i = 0; i < 8; i++)
+ {
+ finalcount[i] = (unsigned char)((context->count[(i >= 4 ? 0 : 1)] >> ((3 - (i & 3)) * 8)) & 255); /* Endian independent */
+ }
+#endif
+ c = 0200;
+ SHA1Update(context, &c, 1);
+ /* Pad until the length field will exactly fill the block. */
+ while ((context->count[0] & 504) != 448)
+ {
+ c = 0000;
+ SHA1Update(context, &c, 1);
+ }
+ SHA1Update(context, finalcount, 8); /* Should cause a SHA1Transform() */
+ for (i = 0; i < 20; i++)
+ {
+ digest[i] = (unsigned char)
+ ((context->state[i >> 2] >> ((3 - (i & 3)) * 8)) & 255);
+ }
+ /* Wipe variables */
+ memset(context, '\0', sizeof(*context));
+ memset(&finalcount, '\0', sizeof(finalcount));
+}
+
+/*
+ * Convenience one-shot SHA-1 of a byte string into hash_out.
+ * NOTE(review): EXTRACTION CORRUPTION — the tail of this function (the
+ * "for (ii = 0; ii < len; ...)" update loop, the SHA1Final call and the
+ * closing brace) and the start of sha1.h were lost; the last line below is
+ * fused mid-statement.  Left byte-identical; recover from upstream.
+ */
+void SHA1(
+ char *hash_out,
+ const char *str,
+ int len)
+{
+ SHA1_CTX ctx;
+ unsigned int ii;
+
+ SHA1Init(&ctx);
+ for (ii = 0; ii
+100% Public Domain
+*/
+
+#include "stdint.h"
+
+typedef struct
+{
+ uint32_t state[5];
+ uint32_t count[2];
+ unsigned char buffer[64];
+} SHA1_CTX;
+
+void SHA1Transform(
+ uint32_t state[5],
+ const unsigned char buffer[64]
+);
+
+void SHA1Init(
+ SHA1_CTX * context
+);
+
+void SHA1Update(
+ SHA1_CTX * context,
+ const unsigned char *data,
+ uint32_t len
+);
+
+void SHA1Final(
+ unsigned char digest[20],
+ SHA1_CTX * context
+);
+
+void SHA1(
+ char *hash_out,
+ const char *str,
+ int len);
+
+#endif /* SHA1_H */
\ No newline at end of file
diff --git a/tools/gzinject/src/u8.c b/tools/gzinject/src/u8.c
new file mode 100644
index 000000000..fec215538
--- /dev/null
+++ b/tools/gzinject/src/u8.c
@@ -0,0 +1,248 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <dirent.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include "u8.h"
+
+/* Release `nodec` heap-allocated node entries and their owned filename
+ * strings.  `nodec` is uint8_t, matching the 255-node cap used elsewhere.
+ * NOTE(review): reconstructed -- the patch text lost the span
+ * "<nodec;i++){ if(nodes[i]!=NULL){ if(nodes[i]->" to <...> stripping; the
+ * surviving brace count implies an inner null-check block as restored here. */
+void free_nodes(node_entry_t **nodes, uint8_t nodec){
+	for(int i=0;i<nodec;i++){
+		if(nodes[i]!=NULL){
+			if(nodes[i]->filename) free(nodes[i]->filename); /* free(NULL) is safe; guard kept from original shape */
+			free(nodes[i]);
+		}
+	}
+}
+
+/* Walk `dirname` recursively, appending one heap-allocated node_entry_t per
+ * directory entry to the array at *nodes (grown via realloc); *idx is the
+ * next free slot and total node count.  `directory` is the parent entry,
+ * `recursion` the current depth.  Side effect: the process working directory
+ * is changed into `dirname` and then back up one level -- callers rely on
+ * this chdir discipline, so the net cwd is restored only when `dirname` is a
+ * direct child of the cwd.
+ * NOTE(review): *idx is uint8_t, so archives are capped at 255 nodes and
+ * overflow would wrap silently -- confirm this limit is intended.
+ * NOTE(review): realloc/malloc/stat results are not checked for failure. */
+void get_dir_contents_recursive(const char *dirname, node_entry_t ***nodes, uint8_t *idx, node_entry_t *directory, int recursion){
+ struct stat sbuffer;
+ node_entry_t **node_array = *nodes;
+ DIR *dir;
+ struct dirent *ent;
+ chdir(dirname);
+ if ((dir = opendir(".")) != NULL) {
+ while ((ent = readdir(dir)) != NULL) {
+ /* Skips "." and ".." -- but also any hidden dot-file. */
+ if(ent->d_name[0]=='.') continue;
+ uint8_t this_idx = *idx;
+ node_entry_t **new_nodes = realloc(node_array, ((this_idx+1) * sizeof(*node_array)));
+ node_array = new_nodes;
+ size_t len = strlen(ent->d_name);
+ char *name = malloc(strlen(ent->d_name) + 1);
+ strcpy(name,ent->d_name);
+ name[len] = 0;
+ node_entry_t *node = malloc(sizeof(node_entry_t));
+ node->filename = name; /* ownership transfers to the node; freed by free_nodes */
+ node->directory = directory;
+ stat(name, &sbuffer);
+ node->node.size = sbuffer.st_size;
+ (*idx)++;
+ directory->node.size++; /* parent tracks its child count during the walk */
+ if ((sbuffer.st_mode & S_IFMT) == S_IFDIR) {
+ node->node.type = 0x0001;
+ /* For directories, data_offset temporarily stores the recursion depth;
+ * size restarts at 0 and is incremented by the recursive walk below. */
+ node->node.data_offset=recursion;
+ node->node.size = 0;
+ get_dir_contents_recursive(name,&node_array,idx,node,recursion+1);
+
+ }else{
+ node->node.type = 0x0000;
+ }
+ node_array[this_idx] = node;
+ }
+ closedir(dir);
+ }
+ chdir("..");
+ *nodes = node_array;
+}
+
+/* Arrange src[start..total_cnt) into dest in U8 serialization order for the
+ * parent `dir`: all of dir's plain files first, then each subdirectory
+ * immediately followed (recursively) by its own contents.  *pos is the next
+ * free slot in dest; after a subdirectory's subtree is emitted, its
+ * node.size is set to the index one past its last descendant (the U8
+ * directory "end index" convention, consumed when the node table is written).
+ * NOTE(review): reconstructed -- both loop headers lost the span
+ * "<total_cnt;i++){ node_entry_t *node = src[i]; if(node->" to <...>
+ * stripping in this patch; restored from the surviving remnants. */
+void sort_dir(node_entry_t **src, node_entry_t **dest, node_entry_t *dir, size_t total_cnt,int start, int *pos){
+	/* Pass 1: files belonging directly to `dir`. */
+	for(int i=start;i<total_cnt;i++){
+		node_entry_t *node = src[i];
+		if(node->directory == dir && node->node.type==0x0000){
+			dest[(*pos)++] = node;
+		}
+	}
+	/* Pass 2: subdirectories of `dir`, each followed by its subtree. */
+	for(int i=start;i<total_cnt;i++){
+		node_entry_t *node = src[i];
+		if(node->directory == dir && node->node.type==0x0001){
+			dest[(*pos)++] = node;
+			sort_dir(src,dest,node,total_cnt,start,pos);
+			node->node.size = *pos;
+		}
+	}
+}
+
+/* Build a U8 archive from the contents of directory `dir`, presumably
+ * writing it to `output` via `foutfile` -- the fopen/header-writing section
+ * is missing from this patch text (see corruption notes below).
+ * Returns 1 unconditionally. */
+int create_u8_archive(const char *dir, const char *output){
+ // Root Directory node.
+ node_entry_t rootdirnode;
+ rootdirnode.node.data_offset = 0;
+ rootdirnode.node.type=0x0001;
+ rootdirnode.node.name_offset=0;
+ rootdirnode.node.size=0;
+ rootdirnode.directory=NULL;
+ rootdirnode.filename=".";
+
+ /* Node count starts at 1 for the root itself. */
+ uint8_t nodec = 1;
+
+ node_entry_t **dirnodes = malloc(sizeof(*dirnodes));
+ dirnodes[0] = &rootdirnode;
+ get_dir_contents_recursive(dir,&dirnodes,&nodec,dirnodes[0],0);
+ /* Root's size holds the total node count (U8 convention). */
+ dirnodes[0]->node.size = nodec+1;
+ node_entry_t **sorted = malloc(sizeof(*sorted) * nodec);
+ sorted[0] = dirnodes[0];
+ int pos = 1;
+
+ /* Reorder into serialization order: files before subdirectories. */
+ sort_dir(dirnodes,sorted,sorted[0],nodec,pos,&pos);
+ free(dirnodes);
+ /* String table starts with a single NUL (root's empty name). */
+ uint8_t *string_table = malloc(1);
+ string_table[0] = 0;
+
+ int npos = 1 , dpos = 0, dirdepth = 0;
+
+ uint8_t *data = NULL;
+ chdir(dir);
+
+ /* NOTE(review): the next line is corrupted -- <...> stripping ate the loop
+ * header and start of the first statement, presumably
+ * "i<nodec;i++){ sorted[i]->node.name_offset = npos;". */
+ for(int i=0;inode.name_offset = npos;
+ size_t nlen = strlen(sorted[i]->filename) + 1;
+ uint8_t *new_table = realloc(string_table,npos + nlen);
+ if(new_table!=NULL){
+ string_table = new_table;
+ }
+
+ memcpy(string_table + npos,sorted[i]->filename,nlen);
+ string_table[npos+nlen-1]=0;
+ npos+=nlen;
+ if(sorted[i]->node.type==0x0001){
+ /* Directories: descend so subsequent file reads resolve relative paths. */
+ chdir(sorted[i]->filename);
+ dirdepth++;
+ }else{
+ /* Files: read contents into the data blob, padded to a 32-byte boundary. */
+ uint32_t padlen = addpadding(sorted[i]->node.size,32);
+ uint8_t *new_data = realloc(data,dpos + padlen);
+ if(new_data!=NULL){
+ data = new_data;
+ }
+ memset(data + dpos,0,padlen);
+ FILE *fle = fopen(sorted[i]->filename, "rb");
+ fread(data + dpos, 1, sorted[i]->node.size, fle);
+ fclose(fle);
+ /* data_offset is blob-relative here; rebased by `dataoffset` below. */
+ sorted[i]->node.data_offset = dpos;
+ dpos+=padlen;
+ }
+ }
+
+ /* NOTE(review): a whole section is missing before/inside the next line --
+ * the code that opens `output` as `foutfile`, writes the u8_header, and
+ * computes `dataoffset` and `padcount` is not present in this patch text
+ * (likely eaten by the same <...> stripping).  The surviving remnant is
+ * presumably "i<nodec;i++){ u8_node node = sorted[i]->node;". */
+ for(int i=0;inode;
+ if(node.type==0x0000){
+ node.data_offset+=dataoffset;
+ }
+ /* Node fields are stored big-endian on disk. */
+ node.data_offset = REVERSEENDIAN32(node.data_offset);
+ node.size = REVERSEENDIAN32(node.size);
+ node.name_offset = REVERSEENDIAN16(node.name_offset);
+ fwrite(&node, 1, sizeof(u8_node), foutfile);
+ }
+ /* sorted[0] is the stack-allocated root; only heap nodes are freed. */
+ free_nodes(sorted + 1,nodec - 1);
+ free(sorted);
+ fwrite(string_table, 1, npos, foutfile);
+ free(string_table);
+
+ /* Zero padding between string table and file data. */
+ uint8_t *padding = calloc(padcount, sizeof(uint8_t));
+ fwrite(padding, 1, padcount, foutfile);
+ free(padding);
+
+ fwrite(data, 1, dpos, foutfile);
+ free(data);
+
+ fclose(foutfile);
+
+ return 1;
+}
+
+/* Unpack the in-memory U8 archive image `data` into `outdir`, recreating its
+ * directory tree on disk.  All multi-byte node/header fields are big-endian
+ * in the image and converted with be16/be32.  The process working directory
+ * is moved around during extraction and walked back up at the end.
+ * Always returns 1. */
+int extract_u8_archive(uint8_t *data, const char *outdir){
+	u8_header hdr;
+	u8_node root;
+
+	mkdir(outdir, 0755);
+	chdir(outdir);
+
+	/* Read the header, then the root node, tracking a cursor into `data`. */
+	size_t cursor = 0;
+	memcpy(&hdr, data + cursor, sizeof(hdr));
+	cursor += sizeof(hdr);
+	memcpy(&root, data + cursor, sizeof(u8_node));
+	cursor += sizeof(u8_node);
+
+	/* The root's size field is the total node count, root included. */
+	uint32_t count = be32((uint8_t*)&root.size) - 1;
+	u8_node *table = malloc(sizeof(u8_node)*count);
+	memcpy(table, data + cursor, sizeof(u8_node)*count);
+	cursor += sizeof(u8_node)*count;
+
+	/* Everything between the node table and the payload is the string table. */
+	uint32_t payload_off = be32((uint8_t*)&hdr.data_offset);
+	size_t names_len = payload_off - sizeof(hdr) - (count + 1) * sizeof(u8_node);
+	uint8_t *names = malloc(names_len);
+	memcpy(names, data + cursor, names_len);
+
+	int depth = 0;
+	for (int j = 0; j < count; j++) {
+		u8_node *cur = &table[j];
+		uint32_t off = be32((uint8_t*)&cur->data_offset);
+		uint32_t len = be32((uint8_t*)&cur->size);
+		uint16_t noff = be16((uint8_t*)&cur->name_offset);
+		uint16_t ntype = be16((uint8_t*)&cur->type);
+		char *fname = (char*)&names[noff];
+		if (ntype == 0x0000) {
+			/* Plain file: copy its payload bytes straight out of the image. */
+			FILE *out = fopen(fname, "wb");
+			fwrite(data + off, 1, len, out);
+			fclose(out);
+		} else if (ntype == 0x0100) {
+			/* Directory: its data_offset holds the parent depth, so climb back
+			 * up until we sit at that depth before creating and entering it. */
+			while (depth > off + 1) {
+				chdir("..");
+				depth--;
+			}
+			mkdir(fname, 0755);
+			chdir(fname);
+			depth++;
+		}
+	}
+	/* Walk back out of whatever directory we ended in (runs at least once,
+	 * stepping out of outdir itself when depth is already 0 -- matches the
+	 * original's do/while behavior). */
+	do {
+		chdir("..");
+		depth--;
+	} while (depth > 0);
+	free(names);
+	free(table);
+	return 1;
+}
\ No newline at end of file
diff --git a/tools/gzinject/src/u8.h b/tools/gzinject/src/u8.h
new file mode 100644
index 000000000..7519c692a
--- /dev/null
+++ b/tools/gzinject/src/u8.h
@@ -0,0 +1,34 @@
+#ifndef U8_H_
+#define U8_H_
+
+#include
+#include "gzinject.h"
+
+/* On-disk U8 node table entry (all fields big-endian when serialized). */
+typedef struct {
+ uint16_t type; /* 0x0000 = file, 0x0001 = directory (reads back as 0x0100 from raw big-endian bytes in extract_u8_archive) */
+ uint16_t name_offset; /* byte offset of the entry's name in the string table */
+ uint32_t data_offset; /* file: payload offset; directory: parent depth (see u8.c) */
+ uint32_t size; /* file: byte length; directory: index past its last descendant node */
+}u8_node;
+
+/* U8 archive header. */
+typedef struct
+{
+ uint32_t tag; /* format magic -- presumably 0x55AA382D ("U.8-"); TODO confirm */
+ uint32_t rootnode_offset; /* offset of the root node; presumably sizeof(u8_header) -- verify against writer */
+ uint32_t header_size; /* NOTE(review): presumably node table + string table size; not read by the visible code */
+ uint32_t data_offset; /* offset where file payload data begins (used to locate the string table's end) */
+ uint8_t padding[16];
+} u8_header;
+
+typedef struct node_entry_s node_entry_t;
+
+/* In-memory wrapper pairing an on-disk node with its name and parent link,
+ * used while building an archive from a directory tree. */
+struct node_entry_s {
+ u8_node node; /* the raw node as it will be serialized */
+ char *filename; /* heap-owned copy of the entry's name (freed by free_nodes) */
+ node_entry_t *directory; /* parent directory entry, NULL for the root */
+};
+
+int create_u8_archive(const char *dir, const char *output);
+int extract_u8_archive(uint8_t *data, const char *outdir);
+
+#endif
\ No newline at end of file
diff --git a/tools/z64compress/.editorconfig b/tools/z64compress/.editorconfig
new file mode 100644
index 000000000..342ff359c
--- /dev/null
+++ b/tools/z64compress/.editorconfig
@@ -0,0 +1,15 @@
+root = true
+
+[*]
+end_of_line = lf
+insert_final_newline = true
+
+# Matches multiple files with brace expansion notation
+[*.{c,h,ch}]
+charset = utf-8
+indent_style = tab
+indent_size = 3
+trim_trailing_whitespace = false
+
+[*.md]
+trim_trailing_whitespace = false
diff --git a/tools/z64compress/.gitignore b/tools/z64compress/.gitignore
new file mode 100644
index 000000000..6a47e0517
--- /dev/null
+++ b/tools/z64compress/.gitignore
@@ -0,0 +1,3 @@
+bin/
+o/
+z64compress
diff --git a/tools/z64compress/.gitrepo b/tools/z64compress/.gitrepo
new file mode 100644
index 000000000..0165907a9
--- /dev/null
+++ b/tools/z64compress/.gitrepo
@@ -0,0 +1,12 @@
+; DO NOT EDIT (unless you know what you are doing)
+;
+; This subdirectory is a git "subrepo", and this file is maintained by the
+; git-subrepo command. See https://github.com/ingydotnet/git-subrepo#readme
+;
+[subrepo]
+ remote = https://github.com/z64tools/z64compress.git
+ branch = main
+ commit = 331039828b0e9c995b8727a64b5bc083c78d1476
+ parent = ce3fe6d65dd1b46509f3bbcb538e9bcc56f2cfa3
+ method = merge
+ cmdver = 0.4.5
diff --git a/tools/z64compress/LICENSE b/tools/z64compress/LICENSE
new file mode 100644
index 000000000..f288702d2
--- /dev/null
+++ b/tools/z64compress/LICENSE
@@ -0,0 +1,674 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc.
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+ The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users. We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors. You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+ To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights. Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received. You must make sure that they, too, receive
+or can get the source code. And you must show them these terms so they
+know their rights.
+
+ Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+ For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software. For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+ Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so. This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software. The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable. Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products. If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+ Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary. To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ TERMS AND CONDITIONS
+
+ 0. Definitions.
+
+ "This License" refers to version 3 of the GNU General Public License.
+
+ "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+ "The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+ To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+ A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+ To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+ To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+ An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+ 1. Source Code.
+
+ The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+ A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+ The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+ The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+ The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+ The Corresponding Source for a work in source code form is that
+same work.
+
+ 2. Basic Permissions.
+
+ All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+ You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+ Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+ No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+ When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+ 4. Conveying Verbatim Copies.
+
+ You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+ You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+ 5. Conveying Modified Source Versions.
+
+ You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+ a) The work must carry prominent notices stating that you modified
+ it, and giving a relevant date.
+
+ b) The work must carry prominent notices stating that it is
+ released under this License and any conditions added under section
+ 7. This requirement modifies the requirement in section 4 to
+ "keep intact all notices".
+
+ c) You must license the entire work, as a whole, under this
+ License to anyone who comes into possession of a copy. This
+ License will therefore apply, along with any applicable section 7
+ additional terms, to the whole of the work, and all its parts,
+ regardless of how they are packaged. This License gives no
+ permission to license the work in any other way, but it does not
+ invalidate such permission if you have separately received it.
+
+ d) If the work has interactive user interfaces, each must display
+ Appropriate Legal Notices; however, if the Program has interactive
+ interfaces that do not display Appropriate Legal Notices, your
+ work need not make them do so.
+
+ A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+ 6. Conveying Non-Source Forms.
+
+ You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+ a) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by the
+ Corresponding Source fixed on a durable physical medium
+ customarily used for software interchange.
+
+ b) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by a
+ written offer, valid for at least three years and valid for as
+ long as you offer spare parts or customer support for that product
+ model, to give anyone who possesses the object code either (1) a
+ copy of the Corresponding Source for all the software in the
+ product that is covered by this License, on a durable physical
+ medium customarily used for software interchange, for a price no
+ more than your reasonable cost of physically performing this
+ conveying of source, or (2) access to copy the
+ Corresponding Source from a network server at no charge.
+
+ c) Convey individual copies of the object code with a copy of the
+ written offer to provide the Corresponding Source. This
+ alternative is allowed only occasionally and noncommercially, and
+ only if you received the object code with such an offer, in accord
+ with subsection 6b.
+
+ d) Convey the object code by offering access from a designated
+ place (gratis or for a charge), and offer equivalent access to the
+ Corresponding Source in the same way through the same place at no
+ further charge. You need not require recipients to copy the
+ Corresponding Source along with the object code. If the place to
+ copy the object code is a network server, the Corresponding Source
+ may be on a different server (operated by you or a third party)
+ that supports equivalent copying facilities, provided you maintain
+ clear directions next to the object code saying where to find the
+ Corresponding Source. Regardless of what server hosts the
+ Corresponding Source, you remain obligated to ensure that it is
+ available for as long as needed to satisfy these requirements.
+
+ e) Convey the object code using peer-to-peer transmission, provided
+ you inform other peers where the object code and Corresponding
+ Source of the work are being offered to the general public at no
+ charge under subsection 6d.
+
+ A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+ A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+ "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+ If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+ The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+ Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+ 7. Additional Terms.
+
+ "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+ When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+ Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+ a) Disclaiming warranty or limiting liability differently from the
+ terms of sections 15 and 16 of this License; or
+
+ b) Requiring preservation of specified reasonable legal notices or
+ author attributions in that material or in the Appropriate Legal
+ Notices displayed by works containing it; or
+
+ c) Prohibiting misrepresentation of the origin of that material, or
+ requiring that modified versions of such material be marked in
+ reasonable ways as different from the original version; or
+
+ d) Limiting the use for publicity purposes of names of licensors or
+ authors of the material; or
+
+ e) Declining to grant rights under trademark law for use of some
+ trade names, trademarks, or service marks; or
+
+ f) Requiring indemnification of licensors and authors of that
+ material by anyone who conveys the material (or modified versions of
+ it) with contractual assumptions of liability to the recipient, for
+ any liability that these contractual assumptions directly impose on
+ those licensors and authors.
+
+ All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+ If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+ Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+ 8. Termination.
+
+ You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+ However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+ Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+ Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+ 9. Acceptance Not Required for Having Copies.
+
+ You are not required to accept this License in order to receive or
+run a copy of the Program. Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance. However,
+nothing other than this License grants you permission to propagate or
+modify any covered work. These actions infringe copyright if you do
+not accept this License. Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+ 10. Automatic Licensing of Downstream Recipients.
+
+ Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+ An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+ You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+ 11. Patents.
+
+ A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The
+work thus licensed is called the contributor's "contributor version".
+
+ A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version. For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+ In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+ If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients. "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+ If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+ A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License. You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+ Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+ 12. No Surrender of Others' Freedom.
+
+ If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all. For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+ 13. Use with the GNU Affero General Public License.
+
+ Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work. The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+ 14. Revised Versions of this License.
+
+ The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation. If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+ If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+ Later license versions may give you additional or different
+permissions. However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+ 15. Disclaimer of Warranty.
+
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. Limitation of Liability.
+
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+ 17. Interpretation of Sections 15 and 16.
+
+ If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+ If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    <program>  Copyright (C) <year>  <name of author>
+ This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+ You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<https://www.gnu.org/licenses/>.
+
+ The GNU General Public License does not permit incorporating your program
+into proprietary programs. If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License. But first, please read
+<https://www.gnu.org/licenses/why-not-lgpl.html>.
diff --git a/tools/z64compress/Makefile b/tools/z64compress/Makefile
new file mode 100644
index 000000000..7a83c6190
--- /dev/null
+++ b/tools/z64compress/Makefile
@@ -0,0 +1,49 @@
+CC := gcc
+CFLAGS := -DNDEBUG -s -Os -flto -Wall -Wextra
+
+# Target platform, specify with TARGET= on the command line, linux64 is default.
+# Currently supported: linux64, linux32, win32
+TARGET ?= linux64
+
+ifeq ($(TARGET),linux32)
+ TARGET_CFLAGS := -m32
+else ifeq ($(TARGET),win32)
+# If using a cross compiler, specify the compiler executable on the command line.
+# make TARGET=win32 CC=~/c/mxe/usr/bin/i686-w64-mingw32.static-gcc
+ TARGET_LIBS := -mconsole -municode
+else ifneq ($(TARGET),linux64)
+ $(error Supported targets: linux64, linux32, win32)
+endif
+
+# Whether to use native optimizations, specify with NATIVE_OPT=0/1 on the command line, default is 0.
+# This is not supported by all compilers which is particularly an issue on Mac, and may inhibit tests.
+NATIVE_OPT ?= 0
+ifeq ($(NATIVE_OPT),1)
+ TARGET_CFLAGS += -march=native -mtune=native
+endif
+
+OBJ_DIR := o/$(TARGET)
+
+$(OBJ_DIR)/src/enc/%.o: CFLAGS := -DNDEBUG -s -Ofast -flto -Wall -Isrc/enc/libdeflate
+
+SRC_DIRS := $(shell find src -type d)
+C_DIRS := $(shell find src -type d -not -path "src/enc/libdeflate/*")
+C_FILES := $(foreach dir,$(C_DIRS),$(wildcard $(dir)/*.c))
+C_FILES += src/enc/libdeflate/lib/deflate_compress.c src/enc/libdeflate/lib/utils.c
+O_FILES := $(foreach f,$(C_FILES:.c=.o),$(OBJ_DIR)/$f)
+
+# Make build directories
+$(shell mkdir -p $(foreach dir,$(SRC_DIRS),$(OBJ_DIR)/$(dir)))
+
+.PHONY: all clean
+
+all: z64compress
+
+z64compress: $(O_FILES)
+ $(CC) $(TARGET_CFLAGS) $(CFLAGS) $(O_FILES) -lm -lpthread -lz $(TARGET_LIBS) -o z64compress
+
+$(OBJ_DIR)/%.o: %.c
+ $(CC) -c $(TARGET_CFLAGS) $(CFLAGS) $< -o $@
+
+clean:
+ $(RM) -rf z64compress bin o
diff --git a/tools/z64compress/README.md b/tools/z64compress/README.md
new file mode 100644
index 000000000..4e9a6ba86
--- /dev/null
+++ b/tools/z64compress/README.md
@@ -0,0 +1,102 @@
+# z64compress
+
+`z64compress` is a program for compressing Zelda 64 roms: be they retail, hacked traditionally, or custom-built from the [`Ocarina of Time`](https://github.com/zeldaret/oot) or [`Majora's Mask`](https://github.com/zeldaret/mm) reverse engineering projects. It is written in highly efficient C and leverages the power of multithreading to make compression as fast as possible. To reduce overhead on subsequent compressions, an optional cache directory can be specified.
+
+In addition to the default `yaz`, it supports some faster and more compact algorithms such as `DEFLATE`, `lzo`, `ucl`, and `aplib`. In order to use these, grab patches or code from my [`z64enc` repository](https://github.com/z64me/z64enc).
+
+If you add an algorithm, please make sure `valgrind` reports no memory leaks or other errors before making a pull request. Thank you!
+
+(By the way, `valgrind` works better without the `-march=native -mtune=native` optimizations, so turn those off when testing `valgrind`.)
+
+## Usage
+This is a command line application. Learn from these common examples and adapt the arguments to your needs:
+```
+ compressing oot debug
+ --in "path/to/in.z64"
+ --out "path/to/out.z64"
+ --mb 32
+ --codec yaz
+ --cache "path/to/cache"
+ --dma "0x12F70,1548"
+ --compress "9-14,28-END"
+ --threads 4
+
+ compressing oot ntsc 1.0
+ --in "path/to/in.z64"
+ --out "path/to/out.z64"
+ --mb 32
+ --codec yaz
+ --cache "path/to/cache"
+ --dma "0x7430,1526"
+ --compress "10-14,27-END"
+ --threads 4
+
+ compressing mm usa
+ --in "path/to/in.z64"
+ --out "path/to/out.z64"
+ --mb 32
+ --codec yaz
+ --cache "path/to/cache"
+ --dma "0x1A500,1568"
+ --compress "10-14,23,24,31-END"
+ --skip "1127"
+ --repack "15-20,22"
+ --threads 4
+```
+
+## Arguments
+```
+ --in uncompressed input rom
+
+ --out compressed output rom
+
+ --matching attempt matching compression at the cost of
+ some optimizations and reduced performance
+
+ --mb how many mb the compressed rom should be
+
+ --codec currently supported codecs
+ yaz
+ ucl
+ lzo
+ zlib
+ aplib
+ * to use non-yaz codecs, find patches
+ and code on my z64enc repo
+
+ --cache is optional and won't be created if
+ no path is specified (having a cache
+ makes subsequent compressions faster)
+ * pro-tip: linux users who don't want a
+ cache to persist across power cycles
+ can use the path "/tmp/z64compress"
+
+ --dma specify dmadata address and count
+
+ --compress enable compression on specified files
+
+ --skip disable compression on specified files
+
+ --repack handles Majora's Mask archives
+
+ --threads optional multithreading;
+ exclude this argument to disable it
+
+ --only-stdout reserve stderr for errors and print
+ everything else to stdout
+
+ arguments are executed as they
+ are parsed, so order matters!
+```
+
+## Building
+First, clone the repository and initialize its submodules:
+```
+git clone https://github.com/z64me/z64compress.git
+cd z64compress
+git submodule update --init
+```
+
+A Makefile-based build system is provided. Choose the target platform with `make TARGET=linux64|linux32|win32`, default is linux64. If building for windows with a cross compiler, specify the compiler executable with `make TARGET=win32 CC=/path/to/executable`.
+
+Alternatively, I have included shell scripts for building Linux and Windows binaries. Windows binaries are built using a cross compiler ([I recommend `MXE`](https://mxe.cc/)).
diff --git a/tools/z64compress/release-linux.sh b/tools/z64compress/release-linux.sh
new file mode 100644
index 000000000..bdac70dcc
--- /dev/null
+++ b/tools/z64compress/release-linux.sh
@@ -0,0 +1,14 @@
+# build compression functions (slow)
+gcc -DNDEBUG -s -Ofast -flto -lm -c -Wall -march=native -mtune=native src/enc/*.c src/enc/lzo/*.c src/enc/ucl/comp/*.c src/enc/apultra/*.c
+mkdir -p o
+mv *.o o
+
+# build everything else
+gcc -o z64compress -DNDEBUG src/*.c o/*.o src/enc/libdeflate/lib/deflate_compress.c src/enc/libdeflate/lib/utils.c -Isrc/enc/libdeflate -Wall -Wextra -s -Os -flto -lpthread -lz -march=native -mtune=native
+
+# move to bin directory
+mkdir -p bin/linux64
+mv z64compress bin/linux64
+
+
+
diff --git a/tools/z64compress/release-linux32.sh b/tools/z64compress/release-linux32.sh
new file mode 100644
index 000000000..06d829a7d
--- /dev/null
+++ b/tools/z64compress/release-linux32.sh
@@ -0,0 +1,14 @@
+# build compression functions (slow)
+gcc -m32 -DNDEBUG -s -Ofast -flto -lm -c -Wall -march=native -mtune=native src/enc/*.c src/enc/lzo/*.c src/enc/ucl/comp/*.c src/enc/apultra/*.c
+mkdir -p o
+mv *.o o
+
+# build everything else
+gcc -m32 -o z64compress -DNDEBUG src/*.c o/*.o src/enc/libdeflate/lib/deflate_compress.c src/enc/libdeflate/lib/utils.c -Isrc/enc/libdeflate -Wall -Wextra -s -Os -flto -lpthread -lz -march=native -mtune=native
+
+# move to bin directory
+mkdir -p bin/linux32
+mv z64compress bin/linux32
+
+
+
diff --git a/tools/z64compress/release-win32.sh b/tools/z64compress/release-win32.sh
new file mode 100644
index 000000000..94fc245c9
--- /dev/null
+++ b/tools/z64compress/release-win32.sh
@@ -0,0 +1,12 @@
+# build compression functions (slow)
+i686-w64-mingw32.static-gcc -DNDEBUG -s -Ofast -flto -lm -c -Wall src/enc/*.c src/enc/lzo/*.c src/enc/ucl/comp/*.c src/enc/apultra/*.c
+mkdir -p o
+mv *.o o
+
+# build everything else
+i686-w64-mingw32.static-gcc -o z64compress.exe -DNDEBUG src/*.c o/*.o src/enc/libdeflate/lib/deflate_compress.c src/enc/libdeflate/lib/utils.c -Isrc/enc/libdeflate -Wall -Wextra -s -Os -flto -lpthread -lz -mconsole -municode
+
+# move to bin directory
+mkdir -p bin/win32
+mv z64compress.exe bin/win32
+
diff --git a/tools/z64compress/src/enc/aplib.c b/tools/z64compress/src/enc/aplib.c
new file mode 100644
index 000000000..c2e720a7b
--- /dev/null
+++ b/tools/z64compress/src/enc/aplib.c
@@ -0,0 +1,48 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "apultra/libapultra.h"
+
+static void compression_progress(long long nOriginalSize, long long nCompressedSize) {
+ /* do nothing */
+}
+
+int
+aplenc(
+ void *_src
+ , unsigned src_sz
+ , void *_dst
+ , unsigned *dst_sz
+ , void *_ctx
+)
+{
+ unsigned char *src = _src;
+ unsigned char *dst = _dst;
+ int nMaxCompressedSize = apultra_get_max_compressed_size(src_sz);
+ apultra_stats stats;
+
+ extern int g_hlen; /* header length */
+ memset(dst, 0, g_hlen);
+ memcpy(dst, "APL0", 4);
+ dst[4] = (src_sz >> 24);
+ dst[5] = (src_sz >> 16);
+ dst[6] = (src_sz >> 8);
+ dst[7] = (src_sz >> 0);
+
+ *dst_sz = apultra_compress(
+ src
+ , dst + g_hlen
+ , src_sz
+ , nMaxCompressedSize
+ , 0 /* flags */
+ , 0 /* nMaxWindowSize */
+ , 0 /* nDictionarySize */
+ , compression_progress
+ , &stats
+ );
+
+ *dst_sz = *dst_sz + g_hlen;
+
+ return 0;
+}
+
diff --git a/tools/z64compress/src/enc/apultra/apultra.c b/tools/z64compress/src/enc/apultra/apultra.c
new file mode 100644
index 000000000..24dc2b692
--- /dev/null
+++ b/tools/z64compress/src/enc/apultra/apultra.c
@@ -0,0 +1,1225 @@
+#if 0
+/*
+ * apultra.c - command line compression utility for the apultra library
+ *
+ * Copyright (C) 2019 Emmanuel Marty
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/*
+ * Uses the libdivsufsort library Copyright (c) 2003-2008 Yuta Mori
+ *
+ * Inspired by cap by Sven-Åke Dahl. https://github.com/svendahl/cap
+ * Also inspired by Charles Bloom's compression blog. http://cbloomrants.blogspot.com/
+ * With ideas from LZ4 by Yann Collet. https://github.com/lz4/lz4
+ * With help and support from spke
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#ifdef _WIN32
+#include <windows.h>
+#include <sys/timeb.h>
+#else
+#include <sys/time.h>
+#endif
+#include "libapultra.h"
+
+#define OPT_VERBOSE 1
+#define OPT_STATS 2
+#define OPT_BACKWARD 4
+
+#define TOOL_VERSION "1.4.1"
+
+/*---------------------------------------------------------------------------*/
+
+#ifdef _WIN32
+LARGE_INTEGER hpc_frequency;
+BOOL hpc_available = FALSE;
+#endif
+
+static void do_init_time() {
+#ifdef _WIN32
+ hpc_frequency.QuadPart = 0;
+ hpc_available = QueryPerformanceFrequency(&hpc_frequency);
+#endif
+}
+
+static long long do_get_time() {
+ long long nTime;
+
+#ifdef _WIN32
+ if (hpc_available) {
+ LARGE_INTEGER nCurTime;
+
+ /* Use HPC hardware for best precision */
+ QueryPerformanceCounter(&nCurTime);
+ nTime = (long long)(nCurTime.QuadPart * 1000000LL / hpc_frequency.QuadPart);
+ }
+ else {
+ struct _timeb tb;
+ _ftime(&tb);
+
+ nTime = ((long long)tb.time * 1000LL + (long long)tb.millitm) * 1000LL;
+ }
+#else
+ struct timeval tm;
+ gettimeofday(&tm, NULL);
+
+ nTime = (long long)tm.tv_sec * 1000000LL + (long long)tm.tv_usec;
+#endif
+ return nTime;
+}
+
+static void do_reverse_buffer(unsigned char *pBuffer, size_t nBufferSize) {
+ size_t nMidPoint = nBufferSize / 2;
+ size_t i, j;
+
+ for (i = 0, j = nBufferSize - 1; i < nMidPoint; i++, j--) {
+ unsigned char c = pBuffer[i];
+ pBuffer[i] = pBuffer[j];
+ pBuffer[j] = c;
+ }
+}
+
+/*---------------------------------------------------------------------------*/
+
+static void compression_progress(long long nOriginalSize, long long nCompressedSize) {
+ if (nOriginalSize >= 512 * 1024) {
+ fprintf(stdout, "\r%lld => %lld (%g %%) \b\b\b\b\b", nOriginalSize, nCompressedSize, (double)(nCompressedSize * 100.0 / nOriginalSize));
+ fflush(stdout);
+ }
+}
+
+static int do_compress(const char *pszInFilename, const char *pszOutFilename, const char *pszDictionaryFilename, const unsigned int nOptions, const unsigned int nMaxWindowSize) {
+ long long nStartTime = 0LL, nEndTime = 0LL;
+ size_t nOriginalSize = 0L, nCompressedSize = 0L, nMaxCompressedSize;
+ int nFlags = 0;
+ apultra_stats stats;
+ unsigned char *pDecompressedData;
+ unsigned char *pCompressedData;
+
+ if (nOptions & OPT_VERBOSE) {
+ nStartTime = do_get_time();
+ }
+
+ FILE* f_dict = NULL;
+ size_t nDictionarySize = 0;
+ if (pszDictionaryFilename) {
+ /* Open the dictionary */
+ f_dict = fopen(pszDictionaryFilename, "rb");
+ if (!f_dict) {
+ fprintf(stderr, "error opening dictionary '%s' for reading\n", pszDictionaryFilename);
+ return 100;
+ }
+
+ /* Get dictionary size */
+ fseek(f_dict, 0, SEEK_END);
+ nDictionarySize = (size_t)ftell(f_dict);
+ fseek(f_dict, 0, SEEK_SET);
+
+ if (nDictionarySize > BLOCK_SIZE) nDictionarySize = BLOCK_SIZE;
+ }
+
+ /* Read the whole original file in memory */
+
+ FILE *f_in = fopen(pszInFilename, "rb");
+ if (!f_in) {
+ if (f_dict) fclose(f_dict);
+ fprintf(stderr, "error opening '%s' for reading\n", pszInFilename);
+ return 100;
+ }
+
+ fseek(f_in, 0, SEEK_END);
+ nOriginalSize = (size_t)ftell(f_in);
+ fseek(f_in, 0, SEEK_SET);
+
+ pDecompressedData = (unsigned char*)malloc(nDictionarySize + nOriginalSize);
+ if (!pDecompressedData) {
+ fclose(f_in);
+ if (f_dict) fclose(f_dict);
+ fprintf(stderr, "out of memory for reading '%s', %zd bytes needed\n", pszInFilename, nOriginalSize);
+ return 100;
+ }
+
+ if (f_dict) {
+ /* Read dictionary data */
+ if (fread(pDecompressedData + ((nOptions & OPT_BACKWARD) ? nOriginalSize : 0), 1, nDictionarySize, f_dict) != nDictionarySize) {
+ free(pDecompressedData);
+ fclose(f_in);
+ fclose(f_dict);
+ fprintf(stderr, "I/O error while reading dictionary '%s'\n", pszDictionaryFilename);
+ return 100;
+ }
+
+ fclose(f_dict);
+ f_dict = NULL;
+ }
+
+ /* Read input file data */
+ if (fread(pDecompressedData + ((nOptions & OPT_BACKWARD) ? 0 : nDictionarySize), 1, nOriginalSize, f_in) != nOriginalSize) {
+ free(pDecompressedData);
+ fclose(f_in);
+ fprintf(stderr, "I/O error while reading '%s'\n", pszInFilename);
+ return 100;
+ }
+
+ fclose(f_in);
+
+ if (nOptions & OPT_BACKWARD)
+ do_reverse_buffer(pDecompressedData, nDictionarySize + nOriginalSize);
+
+ /* Allocate max compressed size */
+
+ nMaxCompressedSize = apultra_get_max_compressed_size(nDictionarySize + nOriginalSize);
+
+ pCompressedData = (unsigned char*)malloc(nMaxCompressedSize);
+ if (!pCompressedData) {
+ free(pDecompressedData);
+ fprintf(stderr, "out of memory for compressing '%s', %zd bytes needed\n", pszInFilename, nMaxCompressedSize);
+ return 100;
+ }
+
+ memset(pCompressedData, 0, nMaxCompressedSize);
+
+ nCompressedSize = apultra_compress(pDecompressedData, pCompressedData, nDictionarySize + nOriginalSize, nMaxCompressedSize, nFlags, nMaxWindowSize, nDictionarySize, compression_progress, &stats);
+
+ if ((nOptions & OPT_VERBOSE)) {
+ nEndTime = do_get_time();
+ }
+
+ if (nCompressedSize == -1) {
+ free(pCompressedData);
+ free(pDecompressedData);
+ fprintf(stderr, "compression error for '%s'\n", pszInFilename);
+ return 100;
+ }
+
+ if (nOptions & OPT_BACKWARD)
+ do_reverse_buffer(pCompressedData, nCompressedSize);
+
+ if (pszOutFilename) {
+ FILE *f_out;
+
+ /* Write whole compressed file out */
+
+ f_out = fopen(pszOutFilename, "wb");
+ if (f_out) {
+ fwrite(pCompressedData, 1, nCompressedSize, f_out);
+ fclose(f_out);
+ }
+ }
+
+ free(pCompressedData);
+ free(pDecompressedData);
+
+ if ((nOptions & OPT_VERBOSE)) {
+ double fDelta = ((double)(nEndTime - nStartTime)) / 1000000.0;
+ double fSpeed = ((double)nOriginalSize / 1048576.0) / fDelta;
+ fprintf(stdout, "\rCompressed '%s' in %g seconds, %.02g Mb/s, %d tokens (%g bytes/token), %d into %d bytes ==> %g %%\n",
+ pszInFilename, fDelta, fSpeed, stats.commands_divisor, (double)nOriginalSize / (double)stats.commands_divisor,
+ (int)nOriginalSize, (int)nCompressedSize, (double)(nCompressedSize * 100.0 / nOriginalSize));
+ }
+
+ if (nOptions & OPT_STATS) {
+ fprintf(stdout, "Tokens: literals: %d short matches: %d normal matches: %d large matches: %d rep matches: %d EOD: %d\n",
+ stats.num_literals, stats.num_4bit_matches, stats.num_7bit_matches, stats.num_variable_matches, stats.num_rep_matches, stats.num_eod);
+ if (stats.match_divisor > 0) {
+ fprintf(stdout, "Offsets: min: %d avg: %d max: %d count: %d\n", stats.min_offset, (int)(stats.total_offsets / (long long)stats.match_divisor), stats.max_offset, stats.match_divisor);
+ fprintf(stdout, "Match lens: min: %d avg: %d max: %d count: %d\n", stats.min_match_len, stats.total_match_lens / stats.match_divisor, stats.max_match_len, stats.match_divisor);
+ }
+ else {
+ fprintf(stdout, "Offsets: none\n");
+ fprintf(stdout, "Match lens: none\n");
+ }
+ if (stats.rle1_divisor > 0) {
+ fprintf(stdout, "RLE1 lens: min: %d avg: %d max: %d count: %d\n", stats.min_rle1_len, stats.total_rle1_lens / stats.rle1_divisor, stats.max_rle1_len, stats.rle1_divisor);
+ }
+ else {
+ fprintf(stdout, "RLE1 lens: none\n");
+ }
+ if (stats.rle2_divisor > 0) {
+ fprintf(stdout, "RLE2 lens: min: %d avg: %d max: %d count: %d\n", stats.min_rle2_len, stats.total_rle2_lens / stats.rle2_divisor, stats.max_rle2_len, stats.rle2_divisor);
+ }
+ else {
+ fprintf(stdout, "RLE2 lens: none\n");
+ }
+ fprintf(stdout, "Safe distance: %d (0x%X)\n", stats.safe_dist, stats.safe_dist);
+ }
+ return 0;
+}
+
+/*---------------------------------------------------------------------------*/
+
+static int do_decompress(const char *pszInFilename, const char *pszOutFilename, const char *pszDictionaryFilename, const unsigned int nOptions) {
+ long long nStartTime = 0LL, nEndTime = 0LL;
+ size_t nCompressedSize, nMaxDecompressedSize, nOriginalSize;
+ unsigned char *pCompressedData;
+ unsigned char *pDecompressedData;
+ int nFlags = 0;
+
+ /* Read the whole compressed file in memory */
+
+ FILE *f_in = fopen(pszInFilename, "rb");
+ if (!f_in) {
+ fprintf(stderr, "error opening '%s' for reading\n", pszInFilename);
+ return 100;
+ }
+
+ fseek(f_in, 0, SEEK_END);
+ nCompressedSize = (size_t)ftell(f_in);
+ fseek(f_in, 0, SEEK_SET);
+
+ pCompressedData = (unsigned char*)malloc(nCompressedSize);
+ if (!pCompressedData) {
+ fclose(f_in);
+ fprintf(stderr, "out of memory for reading '%s', %zd bytes needed\n", pszInFilename, nCompressedSize);
+ return 100;
+ }
+
+ if (fread(pCompressedData, 1, nCompressedSize, f_in) != nCompressedSize) {
+ free(pCompressedData);
+ fclose(f_in);
+ fprintf(stderr, "I/O error while reading '%s'\n", pszInFilename);
+ return 100;
+ }
+
+ fclose(f_in);
+
+ if (nOptions & OPT_BACKWARD)
+ do_reverse_buffer(pCompressedData, nCompressedSize);
+
+ /* Get max decompressed size */
+
+ nMaxDecompressedSize = apultra_get_max_decompressed_size(pCompressedData, nCompressedSize, nFlags);
+ if (nMaxDecompressedSize == -1) {
+ free(pCompressedData);
+ fprintf(stderr, "invalid compressed format for file '%s'\n", pszInFilename);
+ return 100;
+ }
+
+ FILE* f_dict = NULL;
+ size_t nDictionarySize = 0;
+ if (pszDictionaryFilename) {
+ /* Open the dictionary */
+ f_dict = fopen(pszDictionaryFilename, "rb");
+ if (!f_dict) {
+ fprintf(stderr, "error opening dictionary '%s' for reading\n", pszDictionaryFilename);
+ return 100;
+ }
+
+ /* Get dictionary size */
+ fseek(f_dict, 0, SEEK_END);
+ nDictionarySize = (size_t)ftell(f_dict);
+ fseek(f_dict, 0, SEEK_SET);
+
+ if (nDictionarySize > BLOCK_SIZE) nDictionarySize = BLOCK_SIZE;
+ }
+
+ /* Allocate max decompressed size */
+
+ pDecompressedData = (unsigned char*)malloc(nDictionarySize + nMaxDecompressedSize);
+ if (!pDecompressedData) {
+ free(pCompressedData);
+ if (f_dict) fclose(f_dict);
+ fprintf(stderr, "out of memory for decompressing '%s', %zd bytes needed\n", pszInFilename, nMaxDecompressedSize);
+ return 100;
+ }
+
+ memset(pDecompressedData, 0, nDictionarySize + nMaxDecompressedSize);
+
+ if (f_dict) {
+ /* Read dictionary data */
+ if (fread(pDecompressedData, 1, nDictionarySize, f_dict) != nDictionarySize) {
+ free(pDecompressedData);
+ fclose(f_in);
+ fclose(f_dict);
+ fprintf(stderr, "I/O error while reading dictionary '%s'\n", pszDictionaryFilename);
+ return 100;
+ }
+
+ fclose(f_dict);
+ f_dict = NULL;
+
+ if (nOptions & OPT_BACKWARD)
+ do_reverse_buffer(pDecompressedData, nDictionarySize);
+ }
+
+ if (nOptions & OPT_VERBOSE) {
+ nStartTime = do_get_time();
+ }
+
+ nOriginalSize = apultra_decompress(pCompressedData, pDecompressedData, nCompressedSize, nMaxDecompressedSize, nDictionarySize, nFlags);
+ if (nOriginalSize == -1) {
+ free(pDecompressedData);
+ free(pCompressedData);
+
+ fprintf(stderr, "decompression error for '%s'\n", pszInFilename);
+ return 100;
+ }
+
+ if (nOptions & OPT_BACKWARD)
+ do_reverse_buffer(pDecompressedData + nDictionarySize, nOriginalSize);
+
+ if (pszOutFilename) {
+ FILE *f_out;
+
+ /* Write whole decompressed file out */
+
+ f_out = fopen(pszOutFilename, "wb");
+ if (f_out) {
+ fwrite(pDecompressedData + nDictionarySize, 1, nOriginalSize, f_out);
+ fclose(f_out);
+ }
+ }
+
+ free(pDecompressedData);
+ free(pCompressedData);
+
+ if (nOptions & OPT_VERBOSE) {
+ nEndTime = do_get_time();
+ double fDelta = ((double)(nEndTime - nStartTime)) / 1000000.0;
+ double fSpeed = ((double)nOriginalSize / 1048576.0) / fDelta;
+ fprintf(stdout, "Decompressed '%s' in %g seconds, %g Mb/s\n",
+ pszInFilename, fDelta, fSpeed);
+ }
+
+ return 0;
+}
+
+/*---------------------------------------------------------------------------*/
+
+static int do_compare(const char *pszInFilename, const char *pszOutFilename, const char *pszDictionaryFilename, const unsigned int nOptions) {
+ long long nStartTime = 0LL, nEndTime = 0LL;
+ size_t nCompressedSize, nMaxDecompressedSize, nOriginalSize, nDecompressedSize;
+ unsigned char *pCompressedData = NULL;
+ unsigned char *pOriginalData = NULL;
+ unsigned char *pDecompressedData = NULL;
+ int nFlags = 0;
+
+ /* Read the whole compressed file in memory */
+
+ FILE *f_in = fopen(pszInFilename, "rb");
+ if (!f_in) {
+ fprintf(stderr, "error opening '%s' for reading\n", pszInFilename);
+ return 100;
+ }
+
+ fseek(f_in, 0, SEEK_END);
+ nCompressedSize = (size_t)ftell(f_in);
+ fseek(f_in, 0, SEEK_SET);
+
+ pCompressedData = (unsigned char*)malloc(nCompressedSize);
+ if (!pCompressedData) {
+ fclose(f_in);
+ fprintf(stderr, "out of memory for reading '%s', %zd bytes needed\n", pszInFilename, nCompressedSize);
+ return 100;
+ }
+
+ if (fread(pCompressedData, 1, nCompressedSize, f_in) != nCompressedSize) {
+ free(pCompressedData);
+ fclose(f_in);
+ fprintf(stderr, "I/O error while reading '%s'\n", pszInFilename);
+ return 100;
+ }
+
+ fclose(f_in);
+
+ if (nOptions & OPT_BACKWARD)
+ do_reverse_buffer(pCompressedData, nCompressedSize);
+
+ /* Read the whole original file in memory */
+
+ f_in = fopen(pszOutFilename, "rb");
+ if (!f_in) {
+ free(pCompressedData);
+ fprintf(stderr, "error opening '%s' for reading\n", pszInFilename);
+ return 100;
+ }
+
+ fseek(f_in, 0, SEEK_END);
+ nOriginalSize = (size_t)ftell(f_in);
+ fseek(f_in, 0, SEEK_SET);
+
+ pOriginalData = (unsigned char*)malloc(nOriginalSize);
+ if (!pOriginalData) {
+ fclose(f_in);
+ free(pCompressedData);
+ fprintf(stderr, "out of memory for reading '%s', %zd bytes needed\n", pszInFilename, nOriginalSize);
+ return 100;
+ }
+
+ if (fread(pOriginalData, 1, nOriginalSize, f_in) != nOriginalSize) {
+ free(pOriginalData);
+ fclose(f_in);
+ free(pCompressedData);
+ fprintf(stderr, "I/O error while reading '%s'\n", pszInFilename);
+ return 100;
+ }
+
+ fclose(f_in);
+
+ /* Get max decompressed size */
+
+ nMaxDecompressedSize = apultra_get_max_decompressed_size(pCompressedData, nCompressedSize, nFlags);
+ if (nMaxDecompressedSize == -1) {
+ free(pOriginalData);
+ free(pCompressedData);
+ fprintf(stderr, "invalid compressed format for file '%s'\n", pszInFilename);
+ return 100;
+ }
+
+ FILE* f_dict = NULL;
+ size_t nDictionarySize = 0;
+ if (pszDictionaryFilename) {
+ /* Open the dictionary */
+ f_dict = fopen(pszDictionaryFilename, "rb");
+ if (!f_dict) {
+ fprintf(stderr, "error opening dictionary '%s' for reading\n", pszDictionaryFilename);
+ return 100;
+ }
+
+ /* Get dictionary size */
+ fseek(f_dict, 0, SEEK_END);
+ nDictionarySize = (size_t)ftell(f_dict);
+ fseek(f_dict, 0, SEEK_SET);
+
+ if (nDictionarySize > BLOCK_SIZE) nDictionarySize = BLOCK_SIZE;
+ }
+
+ /* Allocate max decompressed size */
+
+ pDecompressedData = (unsigned char*)malloc(nDictionarySize + nMaxDecompressedSize);
+ if (!pDecompressedData) {
+ free(pOriginalData);
+ free(pCompressedData);
+ if (f_dict) fclose(f_dict);
+ fprintf(stderr, "out of memory for decompressing '%s', %zd bytes needed\n", pszInFilename, nMaxDecompressedSize);
+ return 100;
+ }
+
+ memset(pDecompressedData, 0, nDictionarySize + nMaxDecompressedSize);
+
+ if (f_dict) {
+ /* Read dictionary data */
+ if (fread(pDecompressedData, 1, nDictionarySize, f_dict) != nDictionarySize) {
+ free(pDecompressedData);
+ fclose(f_in);
+ fclose(f_dict);
+ fprintf(stderr, "I/O error while reading dictionary '%s'\n", pszDictionaryFilename);
+ return 100;
+ }
+
+ fclose(f_dict);
+ f_dict = NULL;
+
+ if (nOptions & OPT_BACKWARD)
+ do_reverse_buffer(pDecompressedData, nDictionarySize);
+ }
+
+ if (nOptions & OPT_VERBOSE) {
+ nStartTime = do_get_time();
+ }
+
+ nDecompressedSize = apultra_decompress(pCompressedData, pDecompressedData, nCompressedSize, nMaxDecompressedSize, nDictionarySize, nFlags);
+ if (nDecompressedSize == -1) {
+ free(pDecompressedData);
+ free(pOriginalData);
+ free(pCompressedData);
+
+ fprintf(stderr, "decompression error for '%s'\n", pszInFilename);
+ return 100;
+ }
+
+ if (nOptions & OPT_BACKWARD)
+ do_reverse_buffer(pDecompressedData + nDictionarySize, nDecompressedSize);
+
+ if (nDecompressedSize != nOriginalSize || memcmp(pDecompressedData + nDictionarySize, pOriginalData, nOriginalSize)) {
+ fprintf(stderr, "error comparing compressed file '%s' with original '%s'\n", pszInFilename, pszOutFilename);
+ return 100;
+ }
+
+ free(pDecompressedData);
+ free(pOriginalData);
+ free(pCompressedData);
+
+ if (nOptions & OPT_VERBOSE) {
+ nEndTime = do_get_time();
+ double fDelta = ((double)(nEndTime - nStartTime)) / 1000000.0;
+ double fSpeed = ((double)nOriginalSize / 1048576.0) / fDelta;
+ fprintf(stdout, "Compared '%s' in %g seconds, %g Mb/s\n",
+ pszInFilename, fDelta, fSpeed);
+ }
+
+ return 0;
+}
+
+/*---------------------------------------------------------------------------*/
+
+static void generate_compressible_data(unsigned char *pBuffer, size_t nBufferSize, unsigned int nSeed, int nNumLiteralValues, float fMatchProbability) {
+ size_t nIndex = 0;
+ int nMatchProbability = (int)(fMatchProbability * 1023.0f);
+
+ srand(nSeed);
+
+ if (nIndex >= nBufferSize) return;
+ pBuffer[nIndex++] = rand() % nNumLiteralValues;
+
+ while (nIndex < nBufferSize) {
+ if ((rand() & 1023) >= nMatchProbability) {
+ size_t nLiteralCount = rand() & 127;
+ if (nLiteralCount > (nBufferSize - nIndex))
+ nLiteralCount = nBufferSize - nIndex;
+
+ while (nLiteralCount--)
+ pBuffer[nIndex++] = rand() % nNumLiteralValues;
+ }
+ else {
+ size_t nMatchLength = MIN_MATCH_SIZE + (rand() & 1023);
+ size_t nMatchOffset;
+
+ if (nMatchLength > (nBufferSize - nIndex))
+ nMatchLength = nBufferSize - nIndex;
+ if (nMatchLength > nIndex)
+ nMatchLength = nIndex;
+
+ if (nMatchLength < nIndex)
+ nMatchOffset = rand() % (nIndex - nMatchLength);
+ else
+ nMatchOffset = 0;
+
+ while (nMatchLength--) {
+ pBuffer[nIndex] = pBuffer[nIndex - nMatchOffset];
+ nIndex++;
+ }
+ }
+ }
+}
+
/* Corrupt a buffer for fuzzing: flip all bits of each byte with probability
 * fXorProbability (0..1). Deterministic for a given nSeed. */
static void xor_data(unsigned char *pBuffer, size_t nBufferSize, unsigned int nSeed, float fXorProbability) {
   const int nFlipThreshold = (int)(fXorProbability * 1023.0f);
   size_t i;

   srand(nSeed);

   /* One rand() draw per byte decides whether that byte gets inverted */
   for (i = 0; i < nBufferSize; i++) {
      if ((rand() & 1023) < nFlipThreshold)
         pBuffer[i] ^= 0xff;
   }
}
+
+static int do_self_test(const unsigned int nOptions, const unsigned int nMaxWindowSize, const int nIsQuickTest) {
+ unsigned char *pGeneratedData;
+ unsigned char *pCompressedData;
+ unsigned char *pTmpCompressedData;
+ unsigned char *pTmpDecompressedData;
+ size_t nGeneratedDataSize;
+ size_t nMaxCompressedDataSize;
+ unsigned int nSeed = 123;
+ int nFlags = 0;
+ int i;
+
+ pGeneratedData = (unsigned char*)malloc(4 * BLOCK_SIZE);
+ if (!pGeneratedData) {
+ fprintf(stderr, "out of memory, %d bytes needed\n", 4 * BLOCK_SIZE);
+ return 100;
+ }
+
+ nMaxCompressedDataSize = apultra_get_max_compressed_size(4 * BLOCK_SIZE);
+ pCompressedData = (unsigned char*)malloc(nMaxCompressedDataSize);
+ if (!pCompressedData) {
+ free(pGeneratedData);
+ pGeneratedData = NULL;
+
+ fprintf(stderr, "out of memory, %zd bytes needed\n", nMaxCompressedDataSize);
+ return 100;
+ }
+
+ pTmpCompressedData = (unsigned char*)malloc(nMaxCompressedDataSize);
+ if (!pTmpCompressedData) {
+ free(pCompressedData);
+ pCompressedData = NULL;
+ free(pGeneratedData);
+ pGeneratedData = NULL;
+
+ fprintf(stderr, "out of memory, %zd bytes needed\n", nMaxCompressedDataSize);
+ return 100;
+ }
+
+ pTmpDecompressedData = (unsigned char*)malloc(4 * BLOCK_SIZE);
+ if (!pTmpDecompressedData) {
+ free(pTmpCompressedData);
+ pTmpCompressedData = NULL;
+ free(pCompressedData);
+ pCompressedData = NULL;
+ free(pGeneratedData);
+ pGeneratedData = NULL;
+
+ fprintf(stderr, "out of memory, %d bytes needed\n", 4 * BLOCK_SIZE);
+ return 100;
+ }
+
+ memset(pGeneratedData, 0, 4 * BLOCK_SIZE);
+ memset(pCompressedData, 0, nMaxCompressedDataSize);
+ memset(pTmpCompressedData, 0, nMaxCompressedDataSize);
+
+ /* Test compressing with a too small buffer to do anything, expect to fail cleanly */
+ for (i = 0; i < 12; i++) {
+ generate_compressible_data(pGeneratedData, i, nSeed, 256, 0.5f);
+ apultra_compress(pGeneratedData, pCompressedData, i, i, nFlags, nMaxWindowSize, 0 /* dictionary size */, NULL, NULL);
+ }
+
+ size_t nDataSizeStep = 128;
+ float fProbabilitySizeStep = nIsQuickTest ? 0.005f : 0.0005f;
+
+ for (nGeneratedDataSize = 1024; nGeneratedDataSize <= (nIsQuickTest ? 1024U : (4U * BLOCK_SIZE)); nGeneratedDataSize += nDataSizeStep) {
+ float fMatchProbability;
+
+ fprintf(stdout, "size %zd", nGeneratedDataSize);
+ for (fMatchProbability = 0; fMatchProbability <= 0.995f; fMatchProbability += fProbabilitySizeStep) {
+ int nNumLiteralValues[12] = { 1, 2, 3, 15, 30, 56, 96, 137, 178, 191, 255, 256 };
+ float fXorProbability;
+
+ fputc('.', stdout);
+ fflush(stdout);
+
+ for (i = 0; i < 12; i++) {
+ /* Generate data to compress */
+ generate_compressible_data(pGeneratedData, nGeneratedDataSize, nSeed, nNumLiteralValues[i], fMatchProbability);
+
+ /* Try to compress it, expected to succeed */
+ size_t nActualCompressedSize = apultra_compress(pGeneratedData, pCompressedData, nGeneratedDataSize, apultra_get_max_compressed_size(nGeneratedDataSize),
+ nFlags, nMaxWindowSize, 0 /* dictionary size */, NULL, NULL);
+ if (nActualCompressedSize == -1 || nActualCompressedSize < (1 + 1 + 1 /* footer */)) {
+ free(pTmpDecompressedData);
+ pTmpDecompressedData = NULL;
+ free(pTmpCompressedData);
+ pTmpCompressedData = NULL;
+ free(pCompressedData);
+ pCompressedData = NULL;
+ free(pGeneratedData);
+ pGeneratedData = NULL;
+
+ fprintf(stderr, "\nself-test: error compressing size %zd, seed %d, match probability %f, literals range %d\n", nGeneratedDataSize, nSeed, fMatchProbability, nNumLiteralValues[i]);
+ return 100;
+ }
+
+ /* Try to decompress it, expected to succeed */
+ size_t nActualDecompressedSize;
+ nActualDecompressedSize = apultra_decompress(pCompressedData, pTmpDecompressedData, nActualCompressedSize, nGeneratedDataSize, 0 /* dictionary size */, nFlags);
+ if (nActualDecompressedSize == -1) {
+ free(pTmpDecompressedData);
+ pTmpDecompressedData = NULL;
+ free(pTmpCompressedData);
+ pTmpCompressedData = NULL;
+ free(pCompressedData);
+ pCompressedData = NULL;
+ free(pGeneratedData);
+ pGeneratedData = NULL;
+
+ fprintf(stderr, "\nself-test: error decompressing size %zd, seed %d, match probability %f, literals range %d\n", nGeneratedDataSize, nSeed, fMatchProbability, nNumLiteralValues[i]);
+ return 100;
+ }
+
+ if (memcmp(pGeneratedData, pTmpDecompressedData, nGeneratedDataSize)) {
+ free(pTmpDecompressedData);
+ pTmpDecompressedData = NULL;
+ free(pTmpCompressedData);
+ pTmpCompressedData = NULL;
+ free(pCompressedData);
+ pCompressedData = NULL;
+ free(pGeneratedData);
+ pGeneratedData = NULL;
+
+ fprintf(stderr, "\nself-test: error comparing decompressed and original data, size %zd, seed %d, match probability %f, literals range %d\n", nGeneratedDataSize, nSeed, fMatchProbability, nNumLiteralValues[i]);
+ return 100;
+ }
+
+ /* Try to decompress corrupted data, expected to fail cleanly, without crashing or corrupting memory outside the output buffer */
+ for (fXorProbability = 0.05f; fXorProbability <= 0.5f; fXorProbability += 0.05f) {
+ memcpy(pTmpCompressedData, pCompressedData, nActualCompressedSize);
+ xor_data(pTmpCompressedData, nActualCompressedSize, nSeed, fXorProbability);
+ apultra_decompress(pTmpCompressedData, pGeneratedData, nActualCompressedSize, nGeneratedDataSize, 0 /* dictionary size */, nFlags);
+ }
+ }
+
+ nSeed++;
+ }
+
+ fputc(10, stdout);
+ fflush(stdout);
+
+ nDataSizeStep <<= 1;
+ if (nDataSizeStep > (128 * 4096))
+ nDataSizeStep = 128 * 4096;
+ fProbabilitySizeStep *= 1.25;
+ if (fProbabilitySizeStep > (0.0005f * 4096))
+ fProbabilitySizeStep = 0.0005f * 4096;
+ }
+
+ free(pTmpDecompressedData);
+ pTmpDecompressedData = NULL;
+
+ free(pTmpCompressedData);
+ pTmpCompressedData = NULL;
+
+ free(pCompressedData);
+ pCompressedData = NULL;
+
+ free(pGeneratedData);
+ pGeneratedData = NULL;
+
+ fprintf(stdout, "All tests passed.\n");
+ return 0;
+}
+
+/*---------------------------------------------------------------------------*/
+
/**
 * Benchmark in-memory compression: read the whole input file, compress it
 * five times into a guard-banded output buffer (keeping the fastest run),
 * optionally write the compressed result out, and print size and timing.
 *
 * @param pszInFilename name of file to compress
 * @param pszOutFilename name of file to write compressed data to, or NULL
 * @param pszDictionaryFilename must be NULL (dictionaries unsupported here)
 * @param nOptions bitmask of OPT_... options
 * @param nMaxWindowSize maximum window size to compress with, 0 for default
 *
 * @return 0 for success, 100 in case of an error
 */
static int do_compr_benchmark(const char *pszInFilename, const char *pszOutFilename, const char *pszDictionaryFilename, const unsigned int nOptions, const unsigned int nMaxWindowSize) {
   size_t nFileSize, nMaxCompressedSize;
   unsigned char *pFileData;
   unsigned char *pCompressedData;
   int nFlags = 0;
   int i;

   if (pszDictionaryFilename) {
      fprintf(stderr, "in-memory benchmarking does not support dictionaries\n");
      return 100;
   }

   /* Read the whole original file in memory */

   FILE *f_in = fopen(pszInFilename, "rb");
   if (!f_in) {
      fprintf(stderr, "error opening '%s' for reading\n", pszInFilename);
      return 100;
   }

   /* Determine the file size by seeking to the end */
   fseek(f_in, 0, SEEK_END);
   nFileSize = (size_t)ftell(f_in);
   fseek(f_in, 0, SEEK_SET);

   pFileData = (unsigned char*)malloc(nFileSize);
   if (!pFileData) {
      fclose(f_in);
      fprintf(stderr, "out of memory for reading '%s', %zd bytes needed\n", pszInFilename, nFileSize);
      return 100;
   }

   if (fread(pFileData, 1, nFileSize, f_in) != nFileSize) {
      free(pFileData);
      fclose(f_in);
      fprintf(stderr, "I/O error while reading '%s'\n", pszInFilename);
      return 100;
   }

   fclose(f_in);

   if (nOptions & OPT_BACKWARD)
      do_reverse_buffer(pFileData, nFileSize);

   /* Allocate max compressed size, plus 2 x 1024-byte guard bands around the payload */

   nMaxCompressedSize = apultra_get_max_compressed_size(nFileSize);

   pCompressedData = (unsigned char*)malloc(nMaxCompressedSize + 2048);
   if (!pCompressedData) {
      free(pFileData);
      fprintf(stderr, "out of memory for compressing '%s', %zd bytes needed\n", pszInFilename, nMaxCompressedSize);
      return 100;
   }

   /* The payload area starts after the 1024-byte left guard band */
   memset(pCompressedData + 1024, 0, nMaxCompressedSize);

   long long nBestCompTime = -1;

   size_t nActualCompressedSize = 0;
   size_t nRightGuardPos = nMaxCompressedSize;

   for (i = 0; i < 5; i++) {
      /* Use a different guard value each pass so stale guards can't mask a write */
      unsigned char nGuard = 0x33 + i;
      int j;

      /* Write guard bytes around the output buffer, to help check for writes outside of it by the compressor */
      memset(pCompressedData, nGuard, 1024);
      memset(pCompressedData + 1024 + nRightGuardPos, nGuard, 1024);

      long long t0 = do_get_time();
      nActualCompressedSize = apultra_compress(pFileData, pCompressedData + 1024, nFileSize, nRightGuardPos, nFlags, nMaxWindowSize, 0 /* dictionary size */, NULL, NULL);
      long long t1 = do_get_time();
      if (nActualCompressedSize == -1) {
         free(pCompressedData);
         free(pFileData);
         fprintf(stderr, "compression error\n");
         return 100;
      }

      /* Keep the best (lowest) time across all passes */
      long long nCurDecTime = t1 - t0;
      if (nBestCompTime == -1 || nBestCompTime > nCurDecTime)
         nBestCompTime = nCurDecTime;

      /* Check guard bytes before the output buffer */
      for (j = 0; j < 1024; j++) {
         if (pCompressedData[j] != nGuard) {
            free(pCompressedData);
            free(pFileData);
            fprintf(stderr, "error, wrote outside of output buffer at %d!\n", j - 1024);
            return 100;
         }
      }

      /* Check guard bytes after the output buffer */
      for (j = 0; j < 1024; j++) {
         if (pCompressedData[1024 + nRightGuardPos + j] != nGuard) {
            free(pCompressedData);
            free(pFileData);
            fprintf(stderr, "error, wrote outside of output buffer at %d!\n", j);
            return 100;
         }
      }

      /* Next pass: shrink the usable output area to the previous compressed
       * size, moving the right guard band up against the actual output */
      nRightGuardPos = nActualCompressedSize;
   }

   if (nOptions & OPT_BACKWARD)
      do_reverse_buffer(pCompressedData + 1024, nActualCompressedSize);

   if (pszOutFilename) {
      FILE *f_out;

      /* Write whole compressed file out */

      f_out = fopen(pszOutFilename, "wb");
      if (f_out) {
         fwrite(pCompressedData + 1024, 1, nActualCompressedSize, f_out);
         fclose(f_out);
      }
   }

   free(pCompressedData);
   free(pFileData);

   fprintf(stdout, "compressed size: %zd bytes\n", nActualCompressedSize);
   /* NOTE(review): speed prints as (KiB) / (ms), which approximates Mb/s */
   fprintf(stdout, "compression time: %lld microseconds (%g Mb/s)\n", nBestCompTime, ((double)nActualCompressedSize / 1024.0) / ((double)nBestCompTime / 1000.0));

   return 0;
}
+
+/*---------------------------------------------------------------------------*/
+
+static int do_dec_benchmark(const char *pszInFilename, const char *pszOutFilename, const char *pszDictionaryFilename, const unsigned int nOptions) {
+ size_t nFileSize, nMaxDecompressedSize;
+ unsigned char *pFileData;
+ unsigned char *pDecompressedData;
+ int nFlags = 0;
+ int i;
+
+ if (pszDictionaryFilename) {
+ fprintf(stderr, "in-memory benchmarking does not support dictionaries\n");
+ return 100;
+ }
+
+ /* Read the whole compressed file in memory */
+
+ FILE *f_in = fopen(pszInFilename, "rb");
+ if (!f_in) {
+ fprintf(stderr, "error opening '%s' for reading\n", pszInFilename);
+ return 100;
+ }
+
+ fseek(f_in, 0, SEEK_END);
+ nFileSize = (size_t)ftell(f_in);
+ fseek(f_in, 0, SEEK_SET);
+
+ pFileData = (unsigned char*)malloc(nFileSize);
+ if (!pFileData) {
+ fclose(f_in);
+ fprintf(stderr, "out of memory for reading '%s', %zd bytes needed\n", pszInFilename, nFileSize);
+ return 100;
+ }
+
+ if (fread(pFileData, 1, nFileSize, f_in) != nFileSize) {
+ free(pFileData);
+ fclose(f_in);
+ fprintf(stderr, "I/O error while reading '%s'\n", pszInFilename);
+ return 100;
+ }
+
+ fclose(f_in);
+
+ if (nOptions & OPT_BACKWARD)
+ do_reverse_buffer(pFileData, nFileSize);
+
+ /* Allocate max decompressed size */
+
+ nMaxDecompressedSize = apultra_get_max_decompressed_size(pFileData, nFileSize, nFlags);
+ if (nMaxDecompressedSize == -1) {
+ free(pFileData);
+ fprintf(stderr, "invalid compressed format for file '%s'\n", pszInFilename);
+ return 100;
+ }
+
+ pDecompressedData = (unsigned char*)malloc(nMaxDecompressedSize);
+ if (!pDecompressedData) {
+ free(pFileData);
+ fprintf(stderr, "out of memory for decompressing '%s', %zd bytes needed\n", pszInFilename, nMaxDecompressedSize);
+ return 100;
+ }
+
+ memset(pDecompressedData, 0, nMaxDecompressedSize);
+
+ long long nBestDecTime = -1;
+
+ size_t nActualDecompressedSize = 0;
+ for (i = 0; i < 50; i++) {
+ long long t0 = do_get_time();
+ nActualDecompressedSize = apultra_decompress(pFileData, pDecompressedData, nFileSize, nMaxDecompressedSize, 0 /* dictionary size */, nFlags);
+ long long t1 = do_get_time();
+ if (nActualDecompressedSize == -1) {
+ free(pDecompressedData);
+ free(pFileData);
+ fprintf(stderr, "decompression error\n");
+ return 100;
+ }
+
+ long long nCurDecTime = t1 - t0;
+ if (nBestDecTime == -1 || nBestDecTime > nCurDecTime)
+ nBestDecTime = nCurDecTime;
+ }
+
+ if (nOptions & OPT_BACKWARD)
+ do_reverse_buffer(pDecompressedData, nActualDecompressedSize);
+
+ if (pszOutFilename) {
+ FILE *f_out;
+
+ /* Write whole decompressed file out */
+
+ f_out = fopen(pszOutFilename, "wb");
+ if (f_out) {
+ fwrite(pDecompressedData, 1, nActualDecompressedSize, f_out);
+ fclose(f_out);
+ }
+ }
+
+ free(pDecompressedData);
+ free(pFileData);
+
+ fprintf(stdout, "decompressed size: %zd bytes\n", nActualDecompressedSize);
+ fprintf(stdout, "decompression time: %lld microseconds (%g Mb/s)\n", nBestDecTime, ((double)nActualDecompressedSize / 1024.0) / ((double)nBestDecTime / 1000.0));
+
+ return 0;
+}
+
+/*---------------------------------------------------------------------------*/
+
+int main(int argc, char **argv) {
+ int i;
+ const char *pszInFilename = NULL;
+ const char *pszOutFilename = NULL;
+ const char *pszDictionaryFilename = NULL;
+ int nArgsError = 0;
+ int nCommandDefined = 0;
+ int nVerifyCompression = 0;
+ char cCommand = 'z';
+ unsigned int nOptions = 0;
+ unsigned int nMaxWindowSize = 0;
+
+ for (i = 1; i < argc; i++) {
+ if (!strcmp(argv[i], "-d")) {
+ if (!nCommandDefined) {
+ nCommandDefined = 1;
+ cCommand = 'd';
+ }
+ else
+ nArgsError = 1;
+ }
+ else if (!strcmp(argv[i], "-z")) {
+ if (!nCommandDefined) {
+ nCommandDefined = 1;
+ cCommand = 'z';
+ }
+ else
+ nArgsError = 1;
+ }
+ else if (!strcmp(argv[i], "-c")) {
+ if (!nVerifyCompression) {
+ nVerifyCompression = 1;
+ }
+ else
+ nArgsError = 1;
+ }
+ else if (!strcmp(argv[i], "-cbench")) {
+ if (!nCommandDefined) {
+ nCommandDefined = 1;
+ cCommand = 'B';
+ }
+ else
+ nArgsError = 1;
+ }
+ else if (!strcmp(argv[i], "-dbench")) {
+ if (!nCommandDefined) {
+ nCommandDefined = 1;
+ cCommand = 'b';
+ }
+ else
+ nArgsError = 1;
+ }
+ else if (!strcmp(argv[i], "-test")) {
+ if (!nCommandDefined) {
+ nCommandDefined = 1;
+ cCommand = 't';
+ }
+ else
+ nArgsError = 1;
+ }
+ else if (!strcmp(argv[i], "-quicktest")) {
+ if (!nCommandDefined) {
+ nCommandDefined = 1;
+ cCommand = 'T';
+ }
+ else
+ nArgsError = 1;
+ }
+ else if (!strcmp(argv[i], "-D")) {
+ if (!pszDictionaryFilename && (i + 1) < argc) {
+ pszDictionaryFilename = argv[i + 1];
+ i++;
+ }
+ else
+ nArgsError = 1;
+ }
+ else if (!strncmp(argv[i], "-D", 2)) {
+ if (!pszDictionaryFilename) {
+ pszDictionaryFilename = argv[i] + 2;
+ }
+ else
+ nArgsError = 1;
+ }
+ else if (!strcmp(argv[i], "-v")) {
+ if ((nOptions & OPT_VERBOSE) == 0) {
+ nOptions |= OPT_VERBOSE;
+ }
+ else
+ nArgsError = 1;
+ }
+ else if (!strcmp(argv[i], "-w")) {
+ if (!nMaxWindowSize && (i + 1) < argc) {
+ char *pEnd = NULL;
+ nMaxWindowSize = (int)strtol(argv[i + 1], &pEnd, 10);
+ if (pEnd && pEnd != argv[i + 1] && (nMaxWindowSize >= 16 && nMaxWindowSize <= 0x200000)) {
+ i++;
+ }
+ else {
+ nArgsError = 1;
+ }
+ }
+ else
+ nArgsError = 1;
+ }
+ else if (!strncmp(argv[i], "-w", 2)) {
+ if (!nMaxWindowSize) {
+ char *pEnd = NULL;
+ nMaxWindowSize = (int)strtol(argv[i] + 2, &pEnd, 10);
+ if (!(pEnd && pEnd != (argv[i] + 2) && (nMaxWindowSize >= 16 && nMaxWindowSize <= 0x200000))) {
+ nArgsError = 1;
+ }
+ }
+ else
+ nArgsError = 1;
+ }
+ else if (!strcmp(argv[i], "-stats")) {
+ if ((nOptions & OPT_STATS) == 0) {
+ nOptions |= OPT_STATS;
+ }
+ else
+ nArgsError = 1;
+ }
+ else if (!strcmp(argv[i], "-b")) {
+ if ((nOptions & OPT_BACKWARD) == 0) {
+ nOptions |= OPT_BACKWARD;
+ }
+ else
+ nArgsError = 1;
+ }
+ else {
+ if (!pszInFilename)
+ pszInFilename = argv[i];
+ else {
+ if (!pszOutFilename)
+ pszOutFilename = argv[i];
+ else
+ nArgsError = 1;
+ }
+ }
+ }
+
+ if (!nArgsError && cCommand == 't') {
+ return do_self_test(nOptions, nMaxWindowSize, 0);
+ }
+ else if (!nArgsError && cCommand == 'T') {
+ return do_self_test(nOptions, nMaxWindowSize, 1);
+ }
+
+ if (nArgsError || !pszInFilename || !pszOutFilename) {
+ fprintf(stderr, "apultra command-line tool v" TOOL_VERSION " by Emmanuel Marty and spke\n");
+ fprintf(stderr, "usage: %s [-c] [-d] [-v] [-b] \n", argv[0]);
+ fprintf(stderr, " -c: check resulting stream after compressing\n");
+ fprintf(stderr, " -d: decompress (default: compress)\n");
+ fprintf(stderr, " -b: backwards compression or decompression\n");
+ fprintf(stderr, " -w : maximum window size, in bytes (16..2097152), defaults to maximum\n");
+ fprintf(stderr, " -D : use dictionary file\n");
+ fprintf(stderr, " -cbench: benchmark in-memory compression\n");
+ fprintf(stderr, " -dbench: benchmark in-memory decompression\n");
+ fprintf(stderr, " -test: run full automated self-tests\n");
+ fprintf(stderr, "-quicktest: run quick automated self-tests\n");
+ fprintf(stderr, " -stats: show compressed data stats\n");
+ fprintf(stderr, " -v: be verbose\n");
+ return 100;
+ }
+
+ do_init_time();
+
+ if (cCommand == 'z') {
+ int nResult = do_compress(pszInFilename, pszOutFilename, pszDictionaryFilename, nOptions, nMaxWindowSize);
+ if (nResult == 0 && nVerifyCompression) {
+ return do_compare(pszOutFilename, pszInFilename, pszDictionaryFilename, nOptions);
+ } else {
+ return nResult;
+ }
+ }
+ else if (cCommand == 'd') {
+ return do_decompress(pszInFilename, pszOutFilename, pszDictionaryFilename, nOptions);
+ }
+ else if (cCommand == 'B') {
+ return do_compr_benchmark(pszInFilename, pszOutFilename, pszDictionaryFilename, nOptions, nMaxWindowSize);
+ }
+ else if (cCommand == 'b') {
+ return do_dec_benchmark(pszInFilename, pszOutFilename, pszDictionaryFilename, nOptions);
+ }
+ else {
+ return 100;
+ }
+}
+#endif
diff --git a/tools/z64compress/src/enc/apultra/divsufsort.c b/tools/z64compress/src/enc/apultra/divsufsort.c
new file mode 100644
index 000000000..3a1c75304
--- /dev/null
+++ b/tools/z64compress/src/enc/apultra/divsufsort.c
@@ -0,0 +1,460 @@
+/*
+ * divsufsort.c for libdivsufsort
+ * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
#include "divsufsort_private.h"
#ifdef _OPENMP
# include <omp.h>
#endif
+
+
+/*- Private Functions -*/
+
+/* Sorts suffixes of type B*. */
/* Sorts suffixes of type B*.
 *
 * T: input text (n bytes); SA: suffix-array workspace (n entries);
 * bucket_A: per-character counts/offsets for type A suffixes;
 * bucket_B: per-character-pair counts/offsets for type B / B* suffixes;
 * n: length of T.
 *
 * Returns m, the number of type B* suffixes found in T.
 */
static
saidx_t
sort_typeBstar(const sauchar_t *T, saidx_t *SA,
               saidx_t *bucket_A, saidx_t *bucket_B,
               saidx_t n) {
  saidx_t *PAb, *ISAb, *buf;
#ifdef _OPENMP
  saidx_t *curbuf;
  saidx_t l;
#endif
  saidx_t i, j, k, t, m, bufsize;
  saint_t c0, c1;
#ifdef _OPENMP
  saint_t d0, d1;
  int tmp;
#endif

  /* Initialize bucket arrays. */
  for(i = 0; i < BUCKET_A_SIZE; ++i) { bucket_A[i] = 0; }
  for(i = 0; i < BUCKET_B_SIZE; ++i) { bucket_B[i] = 0; }

  /* Count the number of occurrences of the first one or two characters of each
     type A, B and B* suffix. Moreover, store the beginning position of all
     type B* suffixes into the array SA. */
  for(i = n - 1, m = n, c0 = T[n - 1]; 0 <= i;) {
    /* type A suffix. */
    do { ++BUCKET_A(c1 = c0); } while((0 <= --i) && ((c0 = T[i]) >= c1));
    if(0 <= i) {
      /* type B* suffix. */
      ++BUCKET_BSTAR(c0, c1);
      SA[--m] = i;
      /* type B suffix. */
      for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) {
        ++BUCKET_B(c0, c1);
      }
    }
  }
  m = n - m;
/*
note:
  A type B* suffix is lexicographically smaller than a type B suffix that
  begins with the same first two characters.
*/

  /* Calculate the index of start/end point of each bucket. */
  for(c0 = 0, i = 0, j = 0; c0 < ALPHABET_SIZE; ++c0) {
    t = i + BUCKET_A(c0);
    BUCKET_A(c0) = i + j; /* start point */
    i = t + BUCKET_B(c0, c0);
    for(c1 = c0 + 1; c1 < ALPHABET_SIZE; ++c1) {
      j += BUCKET_BSTAR(c0, c1);
      BUCKET_BSTAR(c0, c1) = j; /* end point */
      i += BUCKET_B(c0, c1);
    }
  }

  if(0 < m) {
    /* Sort the type B* suffixes by their first two characters. */
    PAb = SA + n - m; ISAb = SA + m;
    for(i = m - 2; 0 <= i; --i) {
      t = PAb[i], c0 = T[t], c1 = T[t + 1];
      SA[--BUCKET_BSTAR(c0, c1)] = i;
    }
    t = PAb[m - 1], c0 = T[t], c1 = T[t + 1];
    SA[--BUCKET_BSTAR(c0, c1)] = m - 1;

    /* Sort the type B* substrings using sssort. */
#ifdef _OPENMP
    /* Parallel variant: each thread claims two-character buckets under a
       critical section and sorts them with a per-thread scratch buffer. */
    tmp = omp_get_max_threads();
    buf = SA + m, bufsize = (n - (2 * m)) / tmp;
    c0 = ALPHABET_SIZE - 2, c1 = ALPHABET_SIZE - 1, j = m;
#pragma omp parallel default(shared) private(curbuf, k, l, d0, d1, tmp)
    {
      tmp = omp_get_thread_num();
      curbuf = buf + tmp * bufsize;
      k = 0;
      for(;;) {
        #pragma omp critical(sssort_lock)
        {
          if(0 < (l = j)) {
            d0 = c0, d1 = c1;
            do {
              k = BUCKET_BSTAR(d0, d1);
              if(--d1 <= d0) {
                d1 = ALPHABET_SIZE - 1;
                if(--d0 < 0) { break; }
              }
            } while(((l - k) <= 1) && (0 < (l = k)));
            c0 = d0, c1 = d1, j = k;
          }
        }
        if(l == 0) { break; }
        sssort(T, PAb, SA + k, SA + l,
               curbuf, bufsize, 2, n, *(SA + k) == (m - 1));
      }
    }
#else
    /* Serial variant: walk the two-character buckets from last to first. */
    buf = SA + m, bufsize = n - (2 * m);
    for(c0 = ALPHABET_SIZE - 2, j = m; 0 < j; --c0) {
      for(c1 = ALPHABET_SIZE - 1; c0 < c1; j = i, --c1) {
        i = BUCKET_BSTAR(c0, c1);
        if(1 < (j - i)) {
          sssort(T, PAb, SA + i, SA + j,
                 buf, bufsize, 2, n, *(SA + i) == (m - 1));
        }
      }
    }
#endif

    /* Compute ranks of type B* substrings. */
    for(i = m - 1; 0 <= i; --i) {
      if(0 <= SA[i]) {
        j = i;
        do { ISAb[SA[i]] = i; } while((0 <= --i) && (0 <= SA[i]));
        SA[i + 1] = i - j;
        if(i <= 0) { break; }
      }
      j = i;
      do { ISAb[SA[i] = ~SA[i]] = j; } while(SA[--i] < 0);
      ISAb[SA[i]] = j;
    }

    /* Construct the inverse suffix array of type B* suffixes using trsort. */
    trsort(ISAb, SA, m, 1);

    /* Set the sorted order of type B* suffixes. */
    for(i = n - 1, j = m, c0 = T[n - 1]; 0 <= i;) {
      for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) >= c1); --i, c1 = c0) { }
      if(0 <= i) {
        t = i;
        for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) { }
        SA[ISAb[--j]] = ((t == 0) || (1 < (t - i))) ? t : ~t;
      }
    }

    /* Calculate the index of start/end point of each bucket. */
    BUCKET_B(ALPHABET_SIZE - 1, ALPHABET_SIZE - 1) = n; /* end point */
    for(c0 = ALPHABET_SIZE - 2, k = m - 1; 0 <= c0; --c0) {
      i = BUCKET_A(c0 + 1) - 1;
      for(c1 = ALPHABET_SIZE - 1; c0 < c1; --c1) {
        t = i - BUCKET_B(c0, c1);
        BUCKET_B(c0, c1) = i; /* end point */

        /* Move all type B* suffixes to the correct position. */
        for(i = t, j = BUCKET_BSTAR(c0, c1);
            j <= k;
            --i, --k) { SA[i] = SA[k]; }
      }
      BUCKET_BSTAR(c0, c0 + 1) = i - BUCKET_B(c0, c0) + 1; /* start point */
      BUCKET_B(c0, c0) = i; /* end point */
    }
  }

  return m;
}
+
+/* Constructs the suffix array by using the sorted order of type B* suffixes. */
+static
+void
+construct_SA(const sauchar_t *T, saidx_t *SA,
+ saidx_t *bucket_A, saidx_t *bucket_B,
+ saidx_t n, saidx_t m) {
+ saidx_t *i, *j, *k;
+ saidx_t s;
+ saint_t c0, c1, c2;
+
+ if(0 < m) {
+ /* Construct the sorted order of type B suffixes by using
+ the sorted order of type B* suffixes. */
+ for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) {
+ /* Scan the suffix array from right to left. */
+ for(i = SA + BUCKET_BSTAR(c1, c1 + 1),
+ j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1;
+ i <= j;
+ --j) {
+ if(0 < (s = *j)) {
+ assert(T[s] == c1);
+ assert(((s + 1) < n) && (T[s] <= T[s + 1]));
+ assert(T[s - 1] <= T[s]);
+ *j = ~s;
+ c0 = T[--s];
+ if((0 < s) && (T[s - 1] > c0)) { s = ~s; }
+ if(c0 != c2) {
+ if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
+ k = SA + BUCKET_B(c2 = c0, c1);
+ }
+ assert(k < j);
+ *k-- = s;
+ } else {
+ assert(((s == 0) && (T[s] == c1)) || (s < 0));
+ *j = ~s;
+ }
+ }
+ }
+ }
+
+ /* Construct the suffix array by using
+ the sorted order of type B suffixes. */
+ k = SA + BUCKET_A(c2 = T[n - 1]);
+ *k++ = (T[n - 2] < c2) ? ~(n - 1) : (n - 1);
+ /* Scan the suffix array from left to right. */
+ for(i = SA, j = SA + n; i < j; ++i) {
+ if(0 < (s = *i)) {
+ assert(T[s - 1] >= T[s]);
+ c0 = T[--s];
+ if((s == 0) || (T[s - 1] < c0)) { s = ~s; }
+ if(c0 != c2) {
+ BUCKET_A(c2) = k - SA;
+ k = SA + BUCKET_A(c2 = c0);
+ }
+ assert(i < k);
+ *k++ = s;
+ } else {
+ assert(s < 0);
+ *i = ~s;
+ }
+ }
+}
+
#if 0
/* NOTE(review): compiled out (#if 0); retained unmodified from upstream
   libdivsufsort for reference. */
/* Constructs the burrows-wheeler transformed string directly
   by using the sorted order of type B* suffixes. */
static
saidx_t
construct_BWT(const sauchar_t *T, saidx_t *SA,
              saidx_t *bucket_A, saidx_t *bucket_B,
              saidx_t n, saidx_t m) {
  saidx_t *i, *j, *k, *orig;
  saidx_t s;
  saint_t c0, c1, c2;

  if(0 < m) {
    /* Construct the sorted order of type B suffixes by using
       the sorted order of type B* suffixes. */
    for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) {
      /* Scan the suffix array from right to left. */
      for(i = SA + BUCKET_BSTAR(c1, c1 + 1),
          j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1;
          i <= j;
          --j) {
        if(0 < (s = *j)) {
          assert(T[s] == c1);
          assert(((s + 1) < n) && (T[s] <= T[s + 1]));
          assert(T[s - 1] <= T[s]);
          c0 = T[--s];
          *j = ~((saidx_t)c0);
          if((0 < s) && (T[s - 1] > c0)) { s = ~s; }
          if(c0 != c2) {
            if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
            k = SA + BUCKET_B(c2 = c0, c1);
          }
          assert(k < j);
          *k-- = s;
        } else if(s != 0) {
          *j = ~s;
#ifndef NDEBUG
        } else {
          assert(T[s] == c1);
#endif
        }
      }
    }
  }

  /* Construct the BWTed string by using
     the sorted order of type B suffixes. */
  k = SA + BUCKET_A(c2 = T[n - 1]);
  *k++ = (T[n - 2] < c2) ? ~((saidx_t)T[n - 2]) : (n - 1);
  /* Scan the suffix array from left to right. */
  for(i = SA, j = SA + n, orig = SA; i < j; ++i) {
    if(0 < (s = *i)) {
      assert(T[s - 1] >= T[s]);
      c0 = T[--s];
      *i = c0;
      if((0 < s) && (T[s - 1] < c0)) { s = ~((saidx_t)T[s - 1]); }
      if(c0 != c2) {
        BUCKET_A(c2) = k - SA;
        k = SA + BUCKET_A(c2 = c0);
      }
      assert(i < k);
      *k++ = s;
    } else if(s != 0) {
      *i = ~s;
    } else {
      orig = i;
    }
  }

  return orig - SA;
}
#endif
+
+/*---------------------------------------------------------------------------*/
+
+/**
+ * Initialize suffix array context
+ *
+ * Allocates the two bucket tables used by divsufsort_build_array().
+ * On partial failure, anything already allocated is released via
+ * divsufsort_destroy().
+ *
+ * @param ctx suffix array context to initialize
+ *
+ * @return 0 for success, or non-zero in case of an error
+ */
+int divsufsort_init(divsufsort_ctx_t *ctx) {
+   ctx->bucket_B = NULL;
+   ctx->bucket_A = (saidx_t *)malloc(BUCKET_A_SIZE * sizeof(saidx_t));
+
+   if (ctx->bucket_A != NULL) {
+      ctx->bucket_B = (saidx_t *)malloc(BUCKET_B_SIZE * sizeof(saidx_t));
+      if (ctx->bucket_B != NULL)
+         return 0;
+   }
+
+   divsufsort_destroy(ctx);
+   return -1;
+}
+
+/**
+ * Destroy suffix array context
+ *
+ * Releases both bucket tables and resets the pointers, so the context can be
+ * safely destroyed again or re-initialized.
+ *
+ * @param ctx suffix array context to destroy
+ */
+void divsufsort_destroy(divsufsort_ctx_t *ctx) {
+   /* free(NULL) is a no-op, so no NULL guards are needed. */
+   free(ctx->bucket_B);
+   ctx->bucket_B = NULL;
+
+   free(ctx->bucket_A);
+   ctx->bucket_A = NULL;
+}
+
+/*- Function -*/
+
+/**
+ * Constructs the suffix array of T using the context's pre-allocated buckets.
+ *
+ * @param ctx suffix array context (initialized via divsufsort_init())
+ * @param T input string of length n
+ * @param SA output array of suffix indices
+ * @param n length of T
+ *
+ * @return 0 if no error occurred, -1 on bad arguments, -2 if the context
+ *         bucket tables are not allocated
+ */
+saint_t
+divsufsort_build_array(divsufsort_ctx_t *ctx, const sauchar_t *T, saidx_t *SA, saidx_t n) {
+   saidx_t m;
+
+   /* Check arguments. */
+   if ((T == NULL) || (SA == NULL) || (n < 0))
+      return -1;
+
+   /* Trivial lengths are resolved directly. */
+   if (n == 0)
+      return 0;
+   if (n == 1) {
+      SA[0] = 0;
+      return 0;
+   }
+   if (n == 2) {
+      m = (T[0] < T[1]);
+      SA[m ^ 1] = 0;
+      SA[m] = 1;
+      return 0;
+   }
+
+   /* Suffixsort. */
+   if ((ctx->bucket_A == NULL) || (ctx->bucket_B == NULL))
+      return -2;
+
+   m = sort_typeBstar(T, SA, ctx->bucket_A, ctx->bucket_B, n);
+   construct_SA(T, SA, ctx->bucket_A, ctx->bucket_B, n, m);
+   return 0;
+}
+
+#if 0
+/* NOTE: compiled out (#if 0) — standalone BWT entry point and version string,
+   unused by this build. */
+saidx_t
+divbwt(const sauchar_t *T, sauchar_t *U, saidx_t *A, saidx_t n) {
+  saidx_t *B;
+  saidx_t *bucket_A, *bucket_B;
+  saidx_t m, pidx, i;
+
+  /* Check arguments. */
+  if((T == NULL) || (U == NULL) || (n < 0)) { return -1; }
+  else if(n <= 1) { if(n == 1) { U[0] = T[0]; } return n; }
+
+  /* Use the caller's temporary array if provided, else allocate one. */
+  if((B = A) == NULL) { B = (saidx_t *)malloc((size_t)(n + 1) * sizeof(saidx_t)); }
+  bucket_A = (saidx_t *)malloc(BUCKET_A_SIZE * sizeof(saidx_t));
+  bucket_B = (saidx_t *)malloc(BUCKET_B_SIZE * sizeof(saidx_t));
+
+  /* Burrows-Wheeler Transform. */
+  if((B != NULL) && (bucket_A != NULL) && (bucket_B != NULL)) {
+    m = sort_typeBstar(T, B, bucket_A, bucket_B, n);
+    pidx = construct_BWT(T, B, bucket_A, bucket_B, n, m);
+
+    /* Copy to output string. */
+    U[0] = T[n - 1];
+    for(i = 0; i < pidx; ++i) { U[i + 1] = (sauchar_t)B[i]; }
+    for(i += 1; i < n; ++i) { U[i] = (sauchar_t)B[i]; }
+    pidx += 1;
+  } else {
+    pidx = -2;   /* allocation failure */
+  }
+
+  free(bucket_B);
+  free(bucket_A);
+  if(A == NULL) { free(B); }
+
+  return pidx;
+}
+
+const char *
+divsufsort_version(void) {
+  return PROJECT_VERSION_FULL;
+}
+#endif
+
+/**
+ * Constructs the suffix array of T with locally allocated bucket tables.
+ *
+ * @param T input string of length n
+ * @param SA output array of suffix indices
+ * @param n length of T
+ *
+ * @return 0 if no error occurred, -1 on bad arguments, -2 on allocation
+ *         failure
+ */
+saint_t
+divsufsort(const sauchar_t *T, saidx_t *SA, saidx_t n) {
+   saidx_t *bucket_A, *bucket_B;
+   saidx_t m;
+   saint_t err;
+
+   /* Check arguments. */
+   if ((T == NULL) || (SA == NULL) || (n < 0))
+      return -1;
+
+   /* Trivial lengths are resolved directly. */
+   if (n == 0)
+      return 0;
+   if (n == 1) {
+      SA[0] = 0;
+      return 0;
+   }
+   if (n == 2) {
+      m = (T[0] < T[1]);
+      SA[m ^ 1] = 0;
+      SA[m] = 1;
+      return 0;
+   }
+
+   bucket_A = (saidx_t *)malloc(BUCKET_A_SIZE * sizeof(saidx_t));
+   bucket_B = (saidx_t *)malloc(BUCKET_B_SIZE * sizeof(saidx_t));
+
+   /* Suffixsort. */
+   if ((bucket_A != NULL) && (bucket_B != NULL)) {
+      m = sort_typeBstar(T, SA, bucket_A, bucket_B, n);
+      construct_SA(T, SA, bucket_A, bucket_B, n, m);
+      err = 0;
+   } else {
+      err = -2;
+   }
+
+   free(bucket_B);
+   free(bucket_A);
+
+   return err;
+}
diff --git a/tools/z64compress/src/enc/apultra/divsufsort.h b/tools/z64compress/src/enc/apultra/divsufsort.h
new file mode 100644
index 000000000..5c617ee73
--- /dev/null
+++ b/tools/z64compress/src/enc/apultra/divsufsort.h
@@ -0,0 +1,192 @@
+/*
+ * divsufsort.h for libdivsufsort
+ * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _DIVSUFSORT_H
+#define _DIVSUFSORT_H 1
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#define DIVSUFSORT_API
+
+/*- Datatypes -*/
+#ifndef SAUCHAR_T
+#define SAUCHAR_T
+typedef unsigned char sauchar_t;
+#endif /* SAUCHAR_T */
+#ifndef SAINT_T
+#define SAINT_T
+typedef int saint_t;
+#endif /* SAINT_T */
+#ifndef SAIDX_T
+#define SAIDX_T
+typedef int saidx_t;
+#endif /* SAIDX_T */
+#ifndef PRIdSAIDX_T
+#define PRIdSAIDX_T "d"
+#endif
+
+/*- divsufsort context */
+/* Holds the two bucket tables reused across calls to
+   divsufsort_build_array(); allocated by divsufsort_init() and released by
+   divsufsort_destroy(). */
+typedef struct _divsufsort_ctx_t {
+   saidx_t *bucket_A;   /* BUCKET_A_SIZE entries (per first character) */
+   saidx_t *bucket_B;   /* BUCKET_B_SIZE entries (per character pair) */
+} divsufsort_ctx_t;
+
+/*- Prototypes -*/
+
+/**
+ * Initialize suffix array context
+ *
+ * @return 0 for success, or non-zero in case of an error
+ */
+int divsufsort_init(divsufsort_ctx_t *ctx);
+
+/**
+ * Destroy suffix array context
+ *
+ * @param ctx suffix array context to destroy
+ */
+void divsufsort_destroy(divsufsort_ctx_t *ctx);
+
+/**
+ * Constructs the suffix array of a given string.
+ * @param ctx suffix array context
+ * @param T[0..n-1] The input string.
+ * @param SA[0..n-1] The output array of suffixes.
+ * @param n The length of the given string.
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+DIVSUFSORT_API
+saint_t divsufsort_build_array(divsufsort_ctx_t *ctx, const sauchar_t *T, saidx_t *SA, saidx_t n);
+
+#if 0
+/**
+ * Constructs the burrows-wheeler transformed string of a given string.
+ * @param T[0..n-1] The input string.
+ * @param U[0..n-1] The output string. (can be T)
+ * @param A[0..n-1] The temporary array. (can be NULL)
+ * @param n The length of the given string.
+ * @return The primary index if no error occurred, -1 or -2 otherwise.
+ */
+DIVSUFSORT_API
+saidx_t
+divbwt(const sauchar_t *T, sauchar_t *U, saidx_t *A, saidx_t n);
+
+/**
+ * Returns the version of the divsufsort library.
+ * @return The version number string.
+ */
+DIVSUFSORT_API
+const char *
+divsufsort_version(void);
+
+
+/**
+ * Constructs the burrows-wheeler transformed string of a given string and suffix array.
+ * @param T[0..n-1] The input string.
+ * @param U[0..n-1] The output string. (can be T)
+ * @param SA[0..n-1] The suffix array. (can be NULL)
+ * @param n The length of the given string.
+ * @param idx The output primary index.
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+DIVSUFSORT_API
+saint_t
+bw_transform(const sauchar_t *T, sauchar_t *U,
+ saidx_t *SA /* can NULL */,
+ saidx_t n, saidx_t *idx);
+
+/**
+ * Inverse BW-transforms a given BWTed string.
+ * @param T[0..n-1] The input string.
+ * @param U[0..n-1] The output string. (can be T)
+ * @param A[0..n-1] The temporary array. (can be NULL)
+ * @param n The length of the given string.
+ * @param idx The primary index.
+ * @return 0 if no error occurred, -1 or -2 otherwise.
+ */
+DIVSUFSORT_API
+saint_t
+inverse_bw_transform(const sauchar_t *T, sauchar_t *U,
+ saidx_t *A /* can NULL */,
+ saidx_t n, saidx_t idx);
+
+/**
+ * Checks the correctness of a given suffix array.
+ * @param T[0..n-1] The input string.
+ * @param SA[0..n-1] The input suffix array.
+ * @param n The length of the given string.
+ * @param verbose The verbose mode.
+ * @return 0 if no error occurred.
+ */
+DIVSUFSORT_API
+saint_t
+sufcheck(const sauchar_t *T, const saidx_t *SA, saidx_t n, saint_t verbose);
+
+/**
+ * Search for the pattern P in the string T.
+ * @param T[0..Tsize-1] The input string.
+ * @param Tsize The length of the given string.
+ * @param P[0..Psize-1] The input pattern string.
+ * @param Psize The length of the given pattern string.
+ * @param SA[0..SAsize-1] The input suffix array.
+ * @param SAsize The length of the given suffix array.
+ * @param idx The output index.
+ * @return The count of matches if no error occurred, -1 otherwise.
+ */
+DIVSUFSORT_API
+saidx_t
+sa_search(const sauchar_t *T, saidx_t Tsize,
+ const sauchar_t *P, saidx_t Psize,
+ const saidx_t *SA, saidx_t SAsize,
+ saidx_t *left);
+
+/**
+ * Search for the character c in the string T.
+ * @param T[0..Tsize-1] The input string.
+ * @param Tsize The length of the given string.
+ * @param SA[0..SAsize-1] The input suffix array.
+ * @param SAsize The length of the given suffix array.
+ * @param c The input character.
+ * @param idx The output index.
+ * @return The count of matches if no error occurred, -1 otherwise.
+ */
+DIVSUFSORT_API
+saidx_t
+sa_simplesearch(const sauchar_t *T, saidx_t Tsize,
+ const saidx_t *SA, saidx_t SAsize,
+ saint_t c, saidx_t *left);
+#endif
+
+saint_t
+divsufsort(const sauchar_t *T, saidx_t *SA, saidx_t n);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
+
+#endif /* _DIVSUFSORT_H */
diff --git a/tools/z64compress/src/enc/apultra/divsufsort_config.h b/tools/z64compress/src/enc/apultra/divsufsort_config.h
new file mode 100644
index 000000000..f112983cf
--- /dev/null
+++ b/tools/z64compress/src/enc/apultra/divsufsort_config.h
@@ -0,0 +1,9 @@
+#define HAVE_STRING_H 1
+#define HAVE_STDLIB_H 1
+#define HAVE_MEMORY_H 1
+#define HAVE_STDINT_H 1
+#define INLINE inline
+
+#ifdef _MSC_VER
+#pragma warning( disable : 4244 )
+#endif /* _MSC_VER */
diff --git a/tools/z64compress/src/enc/apultra/divsufsort_private.h b/tools/z64compress/src/enc/apultra/divsufsort_private.h
new file mode 100644
index 000000000..b4d97ad4b
--- /dev/null
+++ b/tools/z64compress/src/enc/apultra/divsufsort_private.h
@@ -0,0 +1,205 @@
+/*
+ * divsufsort_private.h for libdivsufsort
+ * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _DIVSUFSORT_PRIVATE_H
+#define _DIVSUFSORT_PRIVATE_H 1
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#include "divsufsort_config.h"
+#include <assert.h>
+#include <stdio.h>
+#if HAVE_STRING_H
+# include <string.h>
+#endif
+#if HAVE_STDLIB_H
+# include <stdlib.h>
+#endif
+#if HAVE_MEMORY_H
+# include <memory.h>
+#endif
+#if HAVE_STDDEF_H
+# include <stddef.h>
+#endif
+#if HAVE_STRINGS_H
+# include <strings.h>
+#endif
+#if HAVE_INTTYPES_H
+# include <inttypes.h>
+#else
+# if HAVE_STDINT_H
+#  include <stdint.h>
+# endif
+#endif
+#if defined(BUILD_DIVSUFSORT64)
+# include "divsufsort64.h"
+# ifndef SAIDX_T
+# define SAIDX_T
+# define saidx_t saidx64_t
+# endif /* SAIDX_T */
+# ifndef PRIdSAIDX_T
+# define PRIdSAIDX_T PRIdSAIDX64_T
+# endif /* PRIdSAIDX_T */
+# define divsufsort divsufsort64
+# define divbwt divbwt64
+# define divsufsort_version divsufsort64_version
+# define bw_transform bw_transform64
+# define inverse_bw_transform inverse_bw_transform64
+# define sufcheck sufcheck64
+# define sa_search sa_search64
+# define sa_simplesearch sa_simplesearch64
+# define sssort sssort64
+# define trsort trsort64
+#else
+# include "divsufsort.h"
+#endif
+
+
+/*- Constants -*/
+#if !defined(UINT8_MAX)
+# define UINT8_MAX (255)
+#endif /* UINT8_MAX */
+#if defined(ALPHABET_SIZE) && (ALPHABET_SIZE < 1)
+# undef ALPHABET_SIZE
+#endif
+#if !defined(ALPHABET_SIZE)
+# define ALPHABET_SIZE (UINT8_MAX + 1)
+#endif
+/* for divsufsort.c */
+#define BUCKET_A_SIZE (ALPHABET_SIZE)
+#define BUCKET_B_SIZE (ALPHABET_SIZE * ALPHABET_SIZE)
+/* for sssort.c */
+#if defined(SS_INSERTIONSORT_THRESHOLD)
+# if SS_INSERTIONSORT_THRESHOLD < 1
+# undef SS_INSERTIONSORT_THRESHOLD
+# define SS_INSERTIONSORT_THRESHOLD (1)
+# endif
+#else
+# define SS_INSERTIONSORT_THRESHOLD (8)
+#endif
+#if defined(SS_BLOCKSIZE)
+# if SS_BLOCKSIZE < 0
+# undef SS_BLOCKSIZE
+# define SS_BLOCKSIZE (0)
+# elif 32768 <= SS_BLOCKSIZE
+# undef SS_BLOCKSIZE
+# define SS_BLOCKSIZE (32767)
+# endif
+#else
+# define SS_BLOCKSIZE (1024)
+#endif
+/* minstacksize = log(SS_BLOCKSIZE) / log(3) * 2 */
+#if SS_BLOCKSIZE == 0
+# if defined(BUILD_DIVSUFSORT64)
+# define SS_MISORT_STACKSIZE (96)
+# else
+# define SS_MISORT_STACKSIZE (64)
+# endif
+#elif SS_BLOCKSIZE <= 4096
+# define SS_MISORT_STACKSIZE (16)
+#else
+# define SS_MISORT_STACKSIZE (24)
+#endif
+#if defined(BUILD_DIVSUFSORT64)
+# define SS_SMERGE_STACKSIZE (64)
+#else
+# define SS_SMERGE_STACKSIZE (32)
+#endif
+/* for trsort.c */
+#define TR_INSERTIONSORT_THRESHOLD (8)
+#if defined(BUILD_DIVSUFSORT64)
+# define TR_STACKSIZE (96)
+#else
+# define TR_STACKSIZE (64)
+#endif
+
+
+/*- Macros -*/
+/* NOTE: SWAP expands to use a temporary named 't' that must be declared at
+   the call site. */
+#ifndef SWAP
+# define SWAP(_a, _b) do { t = (_a); (_a) = (_b); (_b) = t; } while(0)
+#endif /* SWAP */
+#ifndef MIN
+# define MIN(_a, _b) (((_a) < (_b)) ? (_a) : (_b))
+#endif /* MIN */
+#ifndef MAX
+# define MAX(_a, _b) (((_a) > (_b)) ? (_a) : (_b))
+#endif /* MAX */
+/* Explicit work-stack helpers: 'stack', 'ssize' and STACK_SIZE must be in
+   scope at the expansion site.  STACK_POP/STACK_POP5 return from the
+   enclosing (void) function when the stack is empty. */
+#define STACK_PUSH(_a, _b, _c, _d)\
+  do {\
+    assert(ssize < STACK_SIZE);\
+    stack[ssize].a = (_a), stack[ssize].b = (_b),\
+    stack[ssize].c = (_c), stack[ssize++].d = (_d);\
+  } while(0)
+#define STACK_PUSH5(_a, _b, _c, _d, _e)\
+  do {\
+    assert(ssize < STACK_SIZE);\
+    stack[ssize].a = (_a), stack[ssize].b = (_b),\
+    stack[ssize].c = (_c), stack[ssize].d = (_d), stack[ssize++].e = (_e);\
+  } while(0)
+#define STACK_POP(_a, _b, _c, _d)\
+  do {\
+    assert(0 <= ssize);\
+    if(ssize == 0) { return; }\
+    (_a) = stack[--ssize].a, (_b) = stack[ssize].b,\
+    (_c) = stack[ssize].c, (_d) = stack[ssize].d;\
+  } while(0)
+#define STACK_POP5(_a, _b, _c, _d, _e)\
+  do {\
+    assert(0 <= ssize);\
+    if(ssize == 0) { return; }\
+    (_a) = stack[--ssize].a, (_b) = stack[ssize].b,\
+    (_c) = stack[ssize].c, (_d) = stack[ssize].d, (_e) = stack[ssize].e;\
+  } while(0)
+/* for divsufsort.c */
+/* B and B* entries share the one bucket_B table, distinguished only by index
+   order ((c1,c0) vs (c0,c1)); the 256-character case uses shifts in place of
+   multiplies. */
+#define BUCKET_A(_c0) bucket_A[(_c0)]
+#if ALPHABET_SIZE == 256
+#define BUCKET_B(_c0, _c1) (bucket_B[((_c1) << 8) | (_c0)])
+#define BUCKET_BSTAR(_c0, _c1) (bucket_B[((_c0) << 8) | (_c1)])
+#else
+#define BUCKET_B(_c0, _c1) (bucket_B[(_c1) * ALPHABET_SIZE + (_c0)])
+#define BUCKET_BSTAR(_c0, _c1) (bucket_B[(_c0) * ALPHABET_SIZE + (_c1)])
+#endif
+
+
+/*- Private Prototypes -*/
+/* sssort.c */
+void
+sssort(const sauchar_t *Td, const saidx_t *PA,
+ saidx_t *first, saidx_t *last,
+ saidx_t *buf, saidx_t bufsize,
+ saidx_t depth, saidx_t n, saint_t lastsuffix);
+/* trsort.c */
+void
+trsort(saidx_t *ISA, saidx_t *SA, saidx_t n, saidx_t depth);
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
+
+#endif /* _DIVSUFSORT_PRIVATE_H */
diff --git a/tools/z64compress/src/enc/apultra/expand.c b/tools/z64compress/src/enc/apultra/expand.c
new file mode 100644
index 000000000..c5ad18229
--- /dev/null
+++ b/tools/z64compress/src/enc/apultra/expand.c
@@ -0,0 +1,396 @@
+/*
+ * expand.c - decompressor implementation
+ *
+ * Copyright (C) 2019 Emmanuel Marty
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/*
+ * Uses the libdivsufsort library Copyright (c) 2003-2008 Yuta Mori
+ *
+ * Inspired by cap by Sven-Åke Dahl. https://github.com/svendahl/cap
+ * Also inspired by Charles Bloom's compression blog. http://cbloomrants.blogspot.com/
+ * With ideas from LZ4 by Yann Collet. https://github.com/lz4/lz4
+ * With help and support from spke
+ *
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include "format.h"
+#include "expand.h"
+#include "libapultra.h"
+
+#ifdef _MSC_VER
+#define FORCE_INLINE __forceinline
+#else /* _MSC_VER */
+#define FORCE_INLINE __attribute__((always_inline))
+#endif /* _MSC_VER */
+
+/* Reads the next bit from the compressed bitstream (MSB-first within a byte).
+ *
+ * ppInBlock   in/out: current input pointer, advanced when a byte is fetched
+ * pDataEnd    one past the end of the input buffer
+ * nCurBitMask in/out: countdown mask (128, 64, ..., 0); 0 = cache empty
+ * bits        in/out: cached byte, left-shifted so the next bit is bit 7
+ *
+ * Returns 0 or 1, or -1 if the input is exhausted.  On -1 the state is left
+ * unchanged, so subsequent calls keep returning -1. */
+static inline FORCE_INLINE int apultra_read_bit(const unsigned char **ppInBlock, const unsigned char *pDataEnd, int *nCurBitMask, unsigned char *bits) {
+   const unsigned char *pInBlock = *ppInBlock;
+   int nBit;
+
+   if ((*nCurBitMask) == 0) {
+      /* Byte cache exhausted: fetch the next input byte. */
+      if (pInBlock >= pDataEnd) return -1;
+      (*bits) = *pInBlock++;
+      (*nCurBitMask) = 128;
+   }
+
+   nBit = ((*bits) & 128) ? 1 : 0;
+
+   (*bits) <<= 1;
+   (*nCurBitMask) >>= 1;
+
+   *ppInBlock = pInBlock;
+   return nBit;
+}
+
+/* Reads an interleaved gamma-coded value ("gamma2"): starting from 1, each
+ * iteration shifts in one data bit, then a continuation bit decides whether
+ * to keep going; decoded values are therefore >= 2.
+ *
+ * Returns the decoded value, or a negative value if the input ran out.
+ * A failed data-bit read folds its -1 into v, but because apultra_read_bit()
+ * leaves its state unchanged on failure, the following continuation-bit read
+ * also fails and the error is returned. */
+static inline FORCE_INLINE int apultra_read_gamma2(const unsigned char **ppInBlock, const unsigned char *pDataEnd, int *nCurBitMask, unsigned char *bits) {
+   int bit;
+   unsigned int v = 1;
+
+   do {
+      v = (v << 1) + apultra_read_bit(ppInBlock, pDataEnd, nCurBitMask, bits);
+      bit = apultra_read_bit(ppInBlock, pDataEnd, nCurBitMask, bits);
+      if (bit < 0) return bit;
+   } while (bit);
+
+   return v;
+}
+
+/**
+ * Get maximum decompressed size of compressed data
+ *
+ * Walks the token stream without producing any output, accumulating the
+ * number of bytes a real decompression would emit.
+ *
+ * @param pInputData compressed data
+ * @param nInputSize compressed size in bytes
+ * @param nFlags compression flags (set to 0)
+ *
+ * @return maximum decompressed size, or (size_t)-1 on truncated/invalid input
+ */
+size_t apultra_get_max_decompressed_size(const unsigned char *pInputData, size_t nInputSize, const unsigned int nFlags) {
+   const unsigned char *pInputDataEnd = pInputData + nInputSize;
+   int nCurBitMask = 0;
+   unsigned char bits = 0;
+   int nMatchOffset = -1;
+   int nFollowsLiteral = 3;
+   size_t nDecompressedSize = 0;
+
+   /* The first byte is always stored verbatim. */
+   if (pInputData >= pInputDataEnd)
+      return -1;
+   pInputData++;
+   nDecompressedSize++;
+
+   while (1) {
+      int nResult;
+
+      nResult = apultra_read_bit(&pInputData, pInputDataEnd, &nCurBitMask, &bits);
+      if (nResult < 0) return -1;
+
+      if (!nResult) {
+         /* '0': literal */
+         if (pInputData < pInputDataEnd) {
+            pInputData++;
+            nDecompressedSize++;
+            nFollowsLiteral = 3;
+         }
+         else {
+            return -1;
+         }
+      }
+      else {
+         nResult = apultra_read_bit(&pInputData, pInputDataEnd, &nCurBitMask, &bits);
+         if (nResult < 0) return -1;
+
+         if (nResult == 0) {
+            unsigned int nMatchLen;
+
+            /* '10': 8+n bits offset */
+            int nMatchOffsetHi = apultra_read_gamma2(&pInputData, pInputDataEnd, &nCurBitMask, &bits);
+            nMatchOffsetHi -= nFollowsLiteral;
+            if (nMatchOffsetHi >= 0) {
+               nMatchOffset = ((unsigned int) nMatchOffsetHi) << 8;
+               /* NOTE(review): this byte fetch is not bounds-checked against
+                  pInputDataEnd — confirm inputs are framed/validated upstream. */
+               nMatchOffset |= (unsigned int)(*pInputData++);
+
+               nMatchLen = apultra_read_gamma2(&pInputData, pInputDataEnd, &nCurBitMask, &bits);
+
+               /* Minimum encodable match length depends on the offset range. */
+               if (nMatchOffset < 128 || nMatchOffset >= MINMATCH4_OFFSET)
+                  nMatchLen += 2;
+               else if (nMatchOffset >= MINMATCH3_OFFSET)
+                  nMatchLen++;
+            }
+            else {
+               /* else rep-match: previous offset is reused */
+               nMatchLen = apultra_read_gamma2(&pInputData, pInputDataEnd, &nCurBitMask, &bits);
+            }
+
+            nFollowsLiteral = 2;
+
+            nDecompressedSize += nMatchLen;
+         }
+         else {
+            nResult = apultra_read_bit(&pInputData, pInputDataEnd, &nCurBitMask, &bits);
+            if (nResult < 0) return -1;
+
+            if (nResult == 0) {
+               unsigned int nCommand;
+               unsigned int nMatchLen;
+
+               /* '110': 7 bits offset + 1 bit length */
+               /* NOTE(review): unchecked byte fetch, same as above. */
+               nCommand = (unsigned int)(*pInputData++);
+               if (nCommand == 0x00) {
+                  /* EOD. No match len follows. */
+                  break;
+               }
+
+               /* Bits 7-1: offset; bit 0: length */
+               nMatchOffset = (nCommand >> 1);
+               nMatchLen = (nCommand & 1) + 2;
+
+               nFollowsLiteral = 2;
+               nDecompressedSize += nMatchLen;
+            }
+            else {
+               unsigned int nShortMatchOffset;
+
+               /* '111': 4 bit offset */
+               nResult = apultra_read_bit(&pInputData, pInputDataEnd, &nCurBitMask, &bits);
+               if (nResult < 0) return -1;
+               nShortMatchOffset = nResult << 3;
+
+               nResult = apultra_read_bit(&pInputData, pInputDataEnd, &nCurBitMask, &bits);
+               if (nResult < 0) return -1;
+               nShortMatchOffset |= nResult << 2;
+
+               nResult = apultra_read_bit(&pInputData, pInputDataEnd, &nCurBitMask, &bits);
+               if (nResult < 0) return -1;
+               nShortMatchOffset |= nResult << 1;
+
+               nResult = apultra_read_bit(&pInputData, pInputDataEnd, &nCurBitMask, &bits);
+               if (nResult < 0) return -1;
+               nShortMatchOffset |= nResult << 0;
+
+               /* The offset bits are consumed only to keep the stream in
+                  sync; a '111' token always expands to exactly one byte. */
+               nFollowsLiteral = 3;
+               nDecompressedSize++;
+            }
+         }
+      }
+   }
+
+   return nDecompressedSize;
+}
+
+/**
+ * Decompress data in memory
+ *
+ * @param pInputData compressed data
+ * @param pOutData buffer for decompressed data
+ * @param nInputSize compressed size in bytes
+ * @param nMaxOutBufferSize maximum capacity of decompression buffer
+ * @param nDictionarySize size of dictionary in front of input data (0 for none)
+ * @param nFlags compression flags (set to 0)
+ *
+ * @return actual decompressed size, or (size_t)-1 for error
+ */
+size_t apultra_decompress(const unsigned char *pInputData, unsigned char *pOutData, size_t nInputSize, size_t nMaxOutBufferSize, size_t nDictionarySize, const unsigned int nFlags) {
+   const unsigned char *pInputDataEnd = pInputData + nInputSize;
+   unsigned char *pCurOutData = pOutData + nDictionarySize;
+   const unsigned char *pOutDataEnd = pCurOutData + nMaxOutBufferSize;
+   /* NOTE(review): assumes the output buffer is at least 20 bytes — confirm. */
+   const unsigned char *pOutDataFastEnd = pOutDataEnd - 20;
+   int nCurBitMask = 0;
+   unsigned char bits = 0;
+   int nMatchOffset = -1;
+   int nFollowsLiteral = 3;
+
+   /* The first byte is always stored verbatim.
+      NOTE(review): the guard uses '&&', so an empty output buffer with
+      non-empty input falls through to the write below — confirm callers
+      guarantee nMaxOutBufferSize >= 1. */
+   if (pInputData >= pInputDataEnd && pCurOutData < pOutDataEnd)
+      return -1;
+   *pCurOutData++ = *pInputData++;
+
+   while (1) {
+      int nResult;
+
+      nResult = apultra_read_bit(&pInputData, pInputDataEnd, &nCurBitMask, &bits);
+      if (nResult < 0) return -1;
+
+      if (!nResult) {
+         /* '0': literal */
+         if (pInputData < pInputDataEnd && pCurOutData < pOutDataEnd) {
+            *pCurOutData++ = *pInputData++;
+            nFollowsLiteral = 3;
+         }
+         else {
+            return -1;
+         }
+      }
+      else {
+         nResult = apultra_read_bit(&pInputData, pInputDataEnd, &nCurBitMask, &bits);
+         if (nResult < 0) return -1;
+
+         if (nResult == 0) {
+            unsigned int nMatchLen;
+
+            /* '10': 8+n bits offset */
+            int nMatchOffsetHi = apultra_read_gamma2(&pInputData, pInputDataEnd, &nCurBitMask, &bits);
+            nMatchOffsetHi -= nFollowsLiteral;
+            if (nMatchOffsetHi >= 0) {
+               nMatchOffset = ((unsigned int) nMatchOffsetHi) << 8;
+               nMatchOffset |= (unsigned int)(*pInputData++);
+
+               nMatchLen = apultra_read_gamma2(&pInputData, pInputDataEnd, &nCurBitMask, &bits);
+
+               /* Minimum encodable match length depends on the offset range. */
+               if (nMatchOffset < 128 || nMatchOffset >= MINMATCH4_OFFSET)
+                  nMatchLen += 2;
+               else if (nMatchOffset >= MINMATCH3_OFFSET)
+                  nMatchLen++;
+            }
+            else {
+               /* else rep-match: the previous nMatchOffset is reused */
+               nMatchLen = apultra_read_gamma2(&pInputData, pInputDataEnd, &nCurBitMask, &bits);
+            }
+
+            nFollowsLiteral = 2;
+            const unsigned char *pSrc = pCurOutData - nMatchOffset;
+            if (pSrc >= pOutData && (pSrc + nMatchLen) <= pOutDataEnd) {
+               /* Fast path: short match, no overlap hazard, room to over-copy. */
+               if (nMatchLen < 11 && nMatchOffset >= 8 && pCurOutData < pOutDataFastEnd) {
+                  memcpy(pCurOutData, pSrc, 8);
+                  memcpy(pCurOutData + 8, pSrc + 8, 2);
+                  pCurOutData += nMatchLen;
+               }
+               else {
+                  if ((pCurOutData + nMatchLen) <= pOutDataEnd) {
+                     /* Do a deterministic, left to right byte copy instead of memcpy() so as to handle overlaps */
+
+                     if (nMatchOffset >= 16 && (pCurOutData + nMatchLen) < (pOutDataFastEnd - 15)) {
+                        const unsigned char *pCopySrc = pSrc;
+                        unsigned char *pCopyDst = pCurOutData;
+                        const unsigned char *pCopyEndDst = pCurOutData + nMatchLen;
+
+                        do {
+                           memcpy(pCopyDst, pCopySrc, 16);
+                           pCopySrc += 16;
+                           pCopyDst += 16;
+                        } while (pCopyDst < pCopyEndDst);
+
+                        pCurOutData += nMatchLen;
+                     }
+                     else {
+                        while (nMatchLen) {
+                           *pCurOutData++ = *pSrc++;
+                           nMatchLen--;
+                        }
+                     }
+                  }
+                  else {
+                     return -1;
+                  }
+               }
+            }
+            else {
+               return -1;
+            }
+         }
+         else {
+            nResult = apultra_read_bit(&pInputData, pInputDataEnd, &nCurBitMask, &bits);
+            if (nResult < 0) return -1;
+
+            if (nResult == 0) {
+               unsigned int nCommand;
+               unsigned int nMatchLen;
+
+               /* '110': 7 bits offset + 1 bit length */
+               nCommand = (unsigned int)(*pInputData++);
+               if (nCommand == 0x00) {
+                  /* EOD. No match len follows. */
+                  break;
+               }
+
+               /* Bits 7-1: offset; bit 0: length */
+               nMatchOffset = (nCommand >> 1);
+               nMatchLen = (nCommand & 1) + 2;
+
+               nFollowsLiteral = 2;
+               const unsigned char *pSrc = pCurOutData - nMatchOffset;
+               if (pSrc >= pOutData && (pSrc + nMatchLen) <= pOutDataEnd) {
+                  if (nMatchOffset >= 8 && pCurOutData < pOutDataFastEnd) {
+                     memcpy(pCurOutData, pSrc, 8);
+                     memcpy(pCurOutData + 8, pSrc + 8, 2);
+                     pCurOutData += nMatchLen;
+                  }
+                  else {
+                     if ((pCurOutData + nMatchLen) <= pOutDataEnd) {
+                        while (nMatchLen) {
+                           *pCurOutData++ = *pSrc++;
+                           nMatchLen--;
+                        }
+                     }
+                     else {
+                        return -1;
+                     }
+                  }
+               }
+               else {
+                  return -1;
+               }
+            }
+            else {
+               unsigned int nShortMatchOffset;
+
+               /* '111': 4 bit offset */
+               nResult = apultra_read_bit(&pInputData, pInputDataEnd, &nCurBitMask, &bits);
+               if (nResult < 0) return -1;
+               nShortMatchOffset = nResult << 3;
+
+               nResult = apultra_read_bit(&pInputData, pInputDataEnd, &nCurBitMask, &bits);
+               if (nResult < 0) return -1;
+               nShortMatchOffset |= nResult << 2;
+
+               nResult = apultra_read_bit(&pInputData, pInputDataEnd, &nCurBitMask, &bits);
+               if (nResult < 0) return -1;
+               nShortMatchOffset |= nResult << 1;
+
+               nResult = apultra_read_bit(&pInputData, pInputDataEnd, &nCurBitMask, &bits);
+               if (nResult < 0) return -1;
+               nShortMatchOffset |= nResult << 0;
+
+               nFollowsLiteral = 3;
+               if (nShortMatchOffset) {
+                  /* Short offset, 1-15: copy one byte from the window */
+                  const unsigned char *pSrc = pCurOutData - nShortMatchOffset;
+                  if (pSrc >= pOutData && (pCurOutData + 1) <= pOutDataEnd && (pSrc + 1) <= pOutDataEnd) {
+                     *pCurOutData++ = *pSrc++;
+                  }
+                  else {
+                     return -1;
+                  }
+               }
+               else {
+                  /* Write zero */
+                  if ((pCurOutData + 1) <= pOutDataEnd) {
+                     *pCurOutData++ = 0;
+                  }
+                  else {
+                     return -1;
+                  }
+               }
+            }
+         }
+      }
+   }
+
+   /* Bytes newly produced, excluding the leading dictionary prefix. */
+   return (size_t)(pCurOutData - pOutData) - nDictionarySize;
+}
diff --git a/tools/z64compress/src/enc/apultra/expand.h b/tools/z64compress/src/enc/apultra/expand.h
new file mode 100644
index 000000000..9cd658ad8
--- /dev/null
+++ b/tools/z64compress/src/enc/apultra/expand.h
@@ -0,0 +1,71 @@
+/*
+ * expand.h - decompressor definitions
+ *
+ * Copyright (C) 2019 Emmanuel Marty
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/*
+ * Uses the libdivsufsort library Copyright (c) 2003-2008 Yuta Mori
+ *
+ * Inspired by cap by Sven-Åke Dahl. https://github.com/svendahl/cap
+ * Also inspired by Charles Bloom's compression blog. http://cbloomrants.blogspot.com/
+ * With ideas from LZ4 by Yann Collet. https://github.com/lz4/lz4
+ * With help and support from spke
+ *
+ */
+
+#ifndef _EXPAND_H
+#define _EXPAND_H
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Get maximum decompressed size of compressed data
+ *
+ * @param pInputData compressed data
+ * @param nInputSize compressed size in bytes
+ * @param nFlags compression flags (set to 0)
+ *
+ * @return maximum decompressed size
+ */
+size_t apultra_get_max_decompressed_size(const unsigned char *pInputData, size_t nInputSize, const unsigned int nFlags);
+
+/**
+ * Decompress data in memory
+ *
+ * @param pInputData compressed data
+ * @param pOutBuffer buffer for decompressed data
+ * @param nInputSize compressed size in bytes
+ * @param nMaxOutBufferSize maximum capacity of decompression buffer
+ * @param nDictionarySize size of dictionary in front of input data (0 for none)
+ * @param nFlags compression flags (set to 0)
+ *
+ * @return actual decompressed size, or -1 for error
+ */
+size_t apultra_decompress(const unsigned char *pInputData, unsigned char *pOutBuffer, size_t nInputSize, size_t nMaxOutBufferSize, size_t nDictionarySize, const unsigned int nFlags);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _EXPAND_H */
diff --git a/tools/z64compress/src/enc/apultra/format.h b/tools/z64compress/src/enc/apultra/format.h
new file mode 100644
index 000000000..1e280c1b3
--- /dev/null
+++ b/tools/z64compress/src/enc/apultra/format.h
@@ -0,0 +1,47 @@
+/*
+ * format.h - byte stream format definitions
+ *
+ * Copyright (C) 2019 Emmanuel Marty
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/*
+ * Uses the libdivsufsort library Copyright (c) 2003-2008 Yuta Mori
+ *
+ * Inspired by cap by Sven-Åke Dahl. https://github.com/svendahl/cap
+ * Also inspired by Charles Bloom's compression blog. http://cbloomrants.blogspot.com/
+ * With ideas from LZ4 by Yann Collet. https://github.com/lz4/lz4
+ * With help and support from spke
+ *
+ */
+
+#ifndef _FORMAT_H
+#define _FORMAT_H
+
+#define MIN_OFFSET 1
+#define MAX_OFFSET 0x1fffff
+
+#define MAX_VARLEN 0x1fffff
+
+#define BLOCK_SIZE 0x100000
+
+#define MIN_MATCH_SIZE 1
+#define MINMATCH3_OFFSET 1280
+#define MINMATCH4_OFFSET 32000
+
+#endif /* _FORMAT_H */
diff --git a/tools/z64compress/src/enc/apultra/libapultra.h b/tools/z64compress/src/enc/apultra/libapultra.h
new file mode 100644
index 000000000..36fd29555
--- /dev/null
+++ b/tools/z64compress/src/enc/apultra/libapultra.h
@@ -0,0 +1,40 @@
+/*
+ * libapultra.h - library definitions
+ *
+ * Copyright (C) 2019 Emmanuel Marty
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/*
+ * Uses the libdivsufsort library Copyright (c) 2003-2008 Yuta Mori
+ *
+ * Inspired by cap by Sven-Åke Dahl. https://github.com/svendahl/cap
+ * Also inspired by Charles Bloom's compression blog. http://cbloomrants.blogspot.com/
+ * With ideas from LZ4 by Yann Collet. https://github.com/lz4/lz4
+ * With help and support from spke
+ *
+ */
+
+#ifndef _LIB_APULTRA_H
+#define _LIB_APULTRA_H
+
+#include "format.h"
+#include "shrink.h"
+#include "expand.h"
+
+#endif /* _LIB_APULTRA_H */
diff --git a/tools/z64compress/src/enc/apultra/matchfinder.c b/tools/z64compress/src/enc/apultra/matchfinder.c
new file mode 100644
index 000000000..8d7802a52
--- /dev/null
+++ b/tools/z64compress/src/enc/apultra/matchfinder.c
@@ -0,0 +1,449 @@
+/*
+ * matchfinder.c - LZ match finder implementation
+ *
+ * The following copying information applies to this specific source code file:
+ *
+ * Written in 2019 by Emmanuel Marty
+ * Portions written in 2014-2015 by Eric Biggers
+ *
+ * To the extent possible under law, the author(s) have dedicated all copyright
+ * and related and neighboring rights to this software to the public domain
+ * worldwide via the Creative Commons Zero 1.0 Universal Public Domain
+ * Dedication (the "CC0").
+ *
+ * This software is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE. See the CC0 for more details.
+ *
+ * You should have received a copy of the CC0 along with this software; if not
+ * see <http://creativecommons.org/publicdomain/zero/1.0/>.
+ */
+
+/*
+ * Uses the libdivsufsort library Copyright (c) 2003-2008 Yuta Mori
+ *
+ * Inspired by cap by Sven-Åke Dahl. https://github.com/svendahl/cap
+ * Also inspired by Charles Bloom's compression blog. http://cbloomrants.blogspot.com/
+ * With ideas from LZ4 by Yann Collet. https://github.com/lz4/lz4
+ * With help and support from spke
+ *
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include "matchfinder.h"
+#include "format.h"
+#include "libapultra.h"
+
+/**
+ * Hash index into TAG_BITS
+ *
+ * @param nIndex index value
+ *
+ * @return hash
+ */
+static inline int apultra_get_index_tag(unsigned int nIndex) {
+ return (int)(((unsigned long long)nIndex * 11400714819323198485ULL) >> (64ULL - TAG_BITS));
+}
+
+/**
+ * Parse input data, build suffix array and overlaid data structures to speed up match finding
+ *
+ * @param pCompressor compression context
+ * @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
+ * @param nInWindowSize total input size in bytes (previously compressed bytes + bytes to compress)
+ *
+ * @return 0 for success, non-zero for failure
+ */
+int apultra_build_suffix_array(apultra_compressor *pCompressor, const unsigned char *pInWindow, const int nInWindowSize) {
+ unsigned long long *intervals = pCompressor->intervals;
+
+ /* Build suffix array from input data */
+ saidx_t *suffixArray = (saidx_t*)intervals;
+ if (divsufsort_build_array(&pCompressor->divsufsort_context, pInWindow, suffixArray, nInWindowSize) != 0) {
+ return 100;
+ }
+
+ int i, r;
+
+ for (i = nInWindowSize - 1; i >= 0; i--) {
+ intervals[i] = suffixArray[i];
+ }
+
+ int *PLCP = (int*)pCompressor->pos_data; /* Use temporarily */
+ int *Phi = PLCP;
+ int nCurLen = 0;
+
+ /* Compute the permuted LCP first (Kärkkäinen method) */
+ Phi[intervals[0]] = -1;
+ for (i = 1; i < nInWindowSize; i++)
+ Phi[intervals[i]] = (unsigned int)intervals[i - 1];
+ for (i = 0; i < nInWindowSize; i++) {
+ if (Phi[i] == -1) {
+ PLCP[i] = 0;
+ continue;
+ }
+ int nMaxLen = (i > Phi[i]) ? (nInWindowSize - i) : (nInWindowSize - Phi[i]);
+ while (nCurLen < nMaxLen && pInWindow[i + nCurLen] == pInWindow[Phi[i] + nCurLen]) nCurLen++;
+ PLCP[i] = nCurLen;
+ if (nCurLen > 0)
+ nCurLen--;
+ }
+
+ /* Rotate permuted LCP into the LCP. This has better cache locality than the direct Kasai LCP method. This also
+ * saves us from having to build the inverse suffix array index, as the LCP is calculated without it using this method,
+ * and the interval builder below doesn't need it either. */
+ intervals[0] &= POS_MASK;
+
+ for (i = 1; i < nInWindowSize; i++) {
+ int nIndex = (int)(intervals[i] & POS_MASK);
+ int nLen = PLCP[nIndex];
+ if (nLen < MIN_MATCH_SIZE)
+ nLen = 0;
+ if (nLen > LCP_MAX)
+ nLen = LCP_MAX;
+ int nTaggedLen = 0;
+ if (nLen)
+ nTaggedLen = (nLen << TAG_BITS) | (apultra_get_index_tag((unsigned int)nIndex) & ((1 << TAG_BITS) - 1));
+ intervals[i] = ((unsigned long long)nIndex) | (((unsigned long long)nTaggedLen) << LCP_SHIFT);
+ }
+
+ /**
+ * Build intervals for finding matches
+ *
+ * Methodology and code fragment taken from wimlib (CC0 license):
+ * https://wimlib.net/git/?p=wimlib;a=blob_plain;f=src/lcpit_matchfinder.c;h=a2d6a1e0cd95200d1f3a5464d8359d5736b14cbe;hb=HEAD
+ */
+ unsigned long long * const SA_and_LCP = intervals;
+ unsigned long long *pos_data = pCompressor->pos_data;
+ unsigned long long next_interval_idx;
+ unsigned long long *top = pCompressor->open_intervals;
+ unsigned long long prev_pos = SA_and_LCP[0] & POS_MASK;
+
+ *top = 0;
+ intervals[0] = 0;
+ next_interval_idx = 1;
+
+ for (r = 1; r < nInWindowSize; r++) {
+ const unsigned long long next_pos = SA_and_LCP[r] & POS_MASK;
+ const unsigned long long next_lcp = SA_and_LCP[r] & LCP_MASK;
+ const unsigned long long top_lcp = *top & LCP_MASK;
+
+ if (next_lcp == top_lcp) {
+ /* Continuing the deepest open interval */
+ pos_data[prev_pos] = *top;
+ }
+ else if (next_lcp > top_lcp) {
+ /* Opening a new interval */
+ *++top = next_lcp | next_interval_idx++;
+ pos_data[prev_pos] = *top;
+ }
+ else {
+ /* Closing the deepest open interval */
+ pos_data[prev_pos] = *top;
+ for (;;) {
+ const unsigned long long closed_interval_idx = *top-- & POS_MASK;
+ const unsigned long long superinterval_lcp = *top & LCP_MASK;
+
+ if (next_lcp == superinterval_lcp) {
+ /* Continuing the superinterval */
+ intervals[closed_interval_idx] = *top;
+ break;
+ }
+ else if (next_lcp > superinterval_lcp) {
+ /* Creating a new interval that is a
+ * superinterval of the one being
+ * closed, but still a subinterval of
+ * its superinterval */
+ *++top = next_lcp | next_interval_idx++;
+ intervals[closed_interval_idx] = *top;
+ break;
+ }
+ else {
+ /* Also closing the superinterval */
+ intervals[closed_interval_idx] = *top;
+ }
+ }
+ }
+ prev_pos = next_pos;
+ }
+
+ /* Close any still-open intervals. */
+ pos_data[prev_pos] = *top;
+ for (; top > pCompressor->open_intervals; top--)
+ intervals[*top & POS_MASK] = *(top - 1);
+
+ /* Success */
+ return 0;
+}
+
+/**
+ * Find matches at the specified offset in the input window
+ *
+ * @param pCompressor compression context
+ * @param nOffset offset to find matches at, in the input window
+ * @param pMatches pointer to returned matches
+ * @param pMatchDepth pointer to returned match depths
+ * @param pMatch1 pointer to 1-byte length, 4 bit offset match
+ * @param nMaxMatches maximum number of matches to return (0 for none)
+ * @param nBlockFlags bit 0: 1 for first block, 0 otherwise; bit 1: 1 for last block, 0 otherwise
+ *
+ * @return number of matches
+ */
+int apultra_find_matches_at(apultra_compressor *pCompressor, const int nOffset, apultra_match *pMatches, unsigned short *pMatchDepth, unsigned char *pMatch1, const int nMaxMatches, const int nBlockFlags) {
+ unsigned long long *intervals = pCompressor->intervals;
+ unsigned long long *pos_data = pCompressor->pos_data;
+ unsigned long long ref;
+ unsigned long long super_ref;
+ unsigned long long match_pos;
+ apultra_match *matchptr;
+ unsigned short *depthptr;
+ const int nMaxOffset = pCompressor->max_offset;
+
+ *pMatch1 = 0;
+
+ /**
+ * Find matches using intervals
+ *
+ * Taken from wimlib (CC0 license):
+ * https://wimlib.net/git/?p=wimlib;a=blob_plain;f=src/lcpit_matchfinder.c;h=a2d6a1e0cd95200d1f3a5464d8359d5736b14cbe;hb=HEAD
+ */
+
+ /* Get the deepest lcp-interval containing the current suffix. */
+ ref = pos_data[nOffset];
+
+ pos_data[nOffset] = 0;
+
+ /* Ascend until we reach a visited interval, the root, or a child of the
+ * root. Link unvisited intervals to the current suffix as we go. */
+ while ((super_ref = intervals[ref & POS_MASK]) & LCP_MASK) {
+ intervals[ref & POS_MASK] = nOffset | VISITED_FLAG;
+ ref = super_ref;
+ }
+
+ if (super_ref == 0) {
+ /* In this case, the current interval may be any of:
+ * (1) the root;
+ * (2) an unvisited child of the root */
+
+ if (ref != 0) /* Not the root? */
+ intervals[ref & POS_MASK] = nOffset | VISITED_FLAG;
+ return 0;
+ }
+
+ /* Ascend indirectly via pos_data[] links. */
+ match_pos = super_ref & EXCL_VISITED_MASK;
+ matchptr = pMatches;
+ depthptr = pMatchDepth;
+ int nPrevOffset = 0;
+ int nPrevLen = 0;
+ int nCurDepth = 0;
+ unsigned short *cur_depth = NULL;
+
+ if (nOffset >= match_pos && (nBlockFlags & 3) == 3) {
+ int nMatchOffset = (int)(nOffset - match_pos);
+ int nMatchLen = (int)(ref >> (LCP_SHIFT + TAG_BITS));
+
+ if ((matchptr - pMatches) < nMaxMatches) {
+ if (nMatchOffset <= nMaxOffset) {
+ if (nPrevOffset && nPrevLen > 2 && nMatchOffset == (nPrevOffset - 1) && nMatchLen == (nPrevLen - 1) && cur_depth && nCurDepth < LCP_MAX) {
+ nCurDepth++;
+ *cur_depth = nCurDepth;
+ }
+ else {
+ nCurDepth = 0;
+
+ cur_depth = depthptr;
+ matchptr->length = nMatchLen;
+ matchptr->offset = nMatchOffset;
+ *depthptr = 0;
+ matchptr++;
+ depthptr++;
+ }
+
+ nPrevLen = nMatchLen;
+ nPrevOffset = nMatchOffset;
+ }
+ }
+ }
+
+ for (;;) {
+ if ((super_ref = pos_data[match_pos]) > ref) {
+ match_pos = intervals[super_ref & POS_MASK] & EXCL_VISITED_MASK;
+
+ if (nOffset >= match_pos && (nBlockFlags & 3) == 3) {
+ int nMatchOffset = (int)(nOffset - match_pos);
+ int nMatchLen = (int)(ref >> (LCP_SHIFT + TAG_BITS));
+
+ if ((matchptr - pMatches) < nMaxMatches) {
+ if (nMatchOffset <= nMaxOffset && abs(nMatchOffset - nPrevOffset) >= 128) {
+ if (nPrevOffset && nPrevLen > 2 && nMatchOffset == (nPrevOffset - 1) && nMatchLen == (nPrevLen - 1) && cur_depth && nCurDepth < LCP_MAX) {
+ nCurDepth++;
+ *cur_depth = nCurDepth | 0x8000;
+ }
+ else {
+ nCurDepth = 0;
+
+ cur_depth = depthptr;
+ matchptr->length = nMatchLen;
+ matchptr->offset = nMatchOffset;
+ *depthptr = 0x8000;
+ matchptr++;
+ depthptr++;
+ }
+
+ nPrevLen = nMatchLen;
+ nPrevOffset = nMatchOffset;
+ }
+ }
+ }
+ }
+
+ while ((super_ref = pos_data[match_pos]) > ref) {
+ match_pos = intervals[super_ref & POS_MASK] & EXCL_VISITED_MASK;
+
+ if (nOffset > match_pos && (nBlockFlags & 3) == 3) {
+ int nMatchOffset = (int)(nOffset - match_pos);
+ int nMatchLen = (int)(ref >> (LCP_SHIFT + TAG_BITS));
+
+ if ((matchptr - pMatches) < nMaxMatches) {
+ if (nMatchOffset <= nMaxOffset && (nMatchLen >= 3 || (nMatchLen >= 2 && (matchptr - pMatches) < (nMaxMatches - 1))) && nMatchLen < 1280 && abs(nMatchOffset - nPrevOffset) >= 128) {
+ if (nPrevOffset && nPrevLen > 2 && nMatchOffset == (nPrevOffset - 1) && nMatchLen == (nPrevLen - 1) && cur_depth && nCurDepth < LCP_MAX) {
+ nCurDepth++;
+ *cur_depth = nCurDepth | 0x8000;
+ }
+ else {
+ nCurDepth = 0;
+
+ cur_depth = depthptr;
+ matchptr->length = nMatchLen;
+ matchptr->offset = nMatchOffset;
+ *depthptr = 0x8000;
+ matchptr++;
+ depthptr++;
+ }
+
+ nPrevLen = nMatchLen;
+ nPrevOffset = nMatchOffset;
+ }
+ }
+ }
+ }
+
+ intervals[ref & POS_MASK] = nOffset | VISITED_FLAG;
+ pos_data[match_pos] = (unsigned long long)ref;
+
+ int nMatchOffset = (int)(nOffset - match_pos);
+ int nMatchLen = (int)(ref >> (LCP_SHIFT + TAG_BITS));
+
+ if ((matchptr - pMatches) < nMaxMatches) {
+ if (nMatchOffset <= nMaxOffset && nMatchOffset != nPrevOffset) {
+ if (nPrevOffset && nPrevLen > 2 && nMatchOffset == (nPrevOffset - 1) && nMatchLen == (nPrevLen - 1) && cur_depth && nCurDepth < LCP_MAX) {
+ nCurDepth++;
+ *cur_depth = nCurDepth;
+ }
+ else {
+ nCurDepth = 0;
+
+ cur_depth = depthptr;
+ matchptr->length = nMatchLen;
+ matchptr->offset = nMatchOffset;
+ *depthptr = 0;
+ matchptr++;
+ depthptr++;
+ }
+
+ nPrevLen = nMatchLen;
+ nPrevOffset = nMatchOffset;
+ }
+ }
+
+ if (nMatchOffset && nMatchOffset < 16 && nMatchLen)
+ *pMatch1 = nMatchOffset;
+
+ if (super_ref == 0)
+ break;
+ ref = super_ref;
+ match_pos = intervals[ref & POS_MASK] & EXCL_VISITED_MASK;
+
+ if (nOffset > match_pos && (nBlockFlags & 3) == 3) {
+ int nMatchOffset = (int)(nOffset - match_pos);
+ int nMatchLen = (int)(ref >> (LCP_SHIFT + TAG_BITS));
+
+ if ((matchptr - pMatches) < nMaxMatches) {
+ if (nMatchOffset <= nMaxOffset && nMatchLen >= 2 && abs(nMatchOffset - nPrevOffset) >= 128) {
+ if (nPrevOffset && nPrevLen > 2 && nMatchOffset == (nPrevOffset - 1) && nMatchLen == (nPrevLen - 1) && cur_depth && nCurDepth < LCP_MAX) {
+ nCurDepth++;
+ *cur_depth = nCurDepth | 0x8000;
+ }
+ else {
+ nCurDepth = 0;
+
+ cur_depth = depthptr;
+ matchptr->length = nMatchLen;
+ matchptr->offset = nMatchOffset;
+ *depthptr = 0x8000;
+ matchptr++;
+ depthptr++;
+ }
+
+ nPrevLen = nMatchLen;
+ nPrevOffset = nMatchOffset;
+ }
+ }
+ }
+ }
+
+ return (int)(matchptr - pMatches);
+}
+
+/**
+ * Skip previously compressed bytes
+ *
+ * @param pCompressor compression context
+ * @param nStartOffset current offset in input window (typically 0)
+ * @param nEndOffset offset to skip to in input window (typically the number of previously compressed bytes)
+ */
+void apultra_skip_matches(apultra_compressor *pCompressor, const int nStartOffset, const int nEndOffset) {
+ apultra_match match;
+ unsigned short depth;
+ unsigned char match1;
+ int i;
+
+ /* Skipping still requires scanning for matches, as this also performs a lazy update of the intervals. However,
+ * we don't store the matches. */
+ for (i = nStartOffset; i < nEndOffset; i++) {
+ apultra_find_matches_at(pCompressor, i, &match, &depth, &match1, 0, 0);
+ }
+}
+
+/**
+ * Find all matches for the data to be compressed
+ *
+ * @param pCompressor compression context
+ * @param nMatchesPerOffset maximum number of matches to store for each offset
+ * @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
+ * @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes
+ * @param nBlockFlags bit 0: 1 for first block, 0 otherwise; bit 1: 1 for last block, 0 otherwise
+ */
+void apultra_find_all_matches(apultra_compressor *pCompressor, const int nMatchesPerOffset, const int nStartOffset, const int nEndOffset, const int nBlockFlags) {
+ apultra_match *pMatch = pCompressor->match;
+ unsigned short *pMatchDepth = pCompressor->match_depth;
+ unsigned char *pMatch1 = pCompressor->match1;
+ int i;
+
+ for (i = nStartOffset; i < nEndOffset; i++) {
+ int nMatches = apultra_find_matches_at(pCompressor, i, pMatch, pMatchDepth, pMatch1, nMatchesPerOffset, nBlockFlags);
+
+ while (nMatches < nMatchesPerOffset) {
+ pMatch[nMatches].length = 0;
+ pMatch[nMatches].offset = 0;
+ pMatchDepth[nMatches] = 0;
+ nMatches++;
+ }
+
+ pMatch += nMatchesPerOffset;
+ pMatchDepth += nMatchesPerOffset;
+ pMatch1++;
+ }
+}
diff --git a/tools/z64compress/src/enc/apultra/matchfinder.h b/tools/z64compress/src/enc/apultra/matchfinder.h
new file mode 100644
index 000000000..4a6935435
--- /dev/null
+++ b/tools/z64compress/src/enc/apultra/matchfinder.h
@@ -0,0 +1,94 @@
+/*
+ * matchfinder.h - LZ match finder definitions
+ *
+ * Copyright (C) 2019 Emmanuel Marty
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/*
+ * Uses the libdivsufsort library Copyright (c) 2003-2008 Yuta Mori
+ *
+ * Inspired by cap by Sven-Åke Dahl. https://github.com/svendahl/cap
+ * Also inspired by Charles Bloom's compression blog. http://cbloomrants.blogspot.com/
+ * With ideas from LZ4 by Yann Collet. https://github.com/lz4/lz4
+ * With help and support from spke
+ *
+ */
+
+#ifndef _MATCHFINDER_H
+#define _MATCHFINDER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Forward declarations */
+typedef struct _apultra_match apultra_match;
+typedef struct _apultra_compressor apultra_compressor;
+
+/**
+ * Parse input data, build suffix array and overlaid data structures to speed up match finding
+ *
+ * @param pCompressor compression context
+ * @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
+ * @param nInWindowSize total input size in bytes (previously compressed bytes + bytes to compress)
+ *
+ * @return 0 for success, non-zero for failure
+ */
+int apultra_build_suffix_array(apultra_compressor *pCompressor, const unsigned char *pInWindow, const int nInWindowSize);
+
+/**
+ * Find matches at the specified offset in the input window
+ *
+ * @param pCompressor compression context
+ * @param nOffset offset to find matches at, in the input window
+ * @param pMatches pointer to returned matches
+ * @param pMatchDepth pointer to returned match depths
+ * @param pMatch1 pointer to 1-byte length, 4 bit offset match
+ * @param nMaxMatches maximum number of matches to return (0 for none)
+ * @param nBlockFlags bit 0: 1 for first block, 0 otherwise; bit 1: 1 for last block, 0 otherwise
+ *
+ * @return number of matches
+ */
+int apultra_find_matches_at(apultra_compressor *pCompressor, const int nOffset, apultra_match *pMatches, unsigned short *pMatchDepth, unsigned char *pMatch1, const int nMaxMatches, const int nBlockFlags);
+
+/**
+ * Skip previously compressed bytes
+ *
+ * @param pCompressor compression context
+ * @param nStartOffset current offset in input window (typically 0)
+ * @param nEndOffset offset to skip to in input window (typically the number of previously compressed bytes)
+ */
+void apultra_skip_matches(apultra_compressor *pCompressor, const int nStartOffset, const int nEndOffset);
+
+/**
+ * Find all matches for the data to be compressed
+ *
+ * @param pCompressor compression context
+ * @param nMatchesPerOffset maximum number of matches to store for each offset
+ * @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
+ * @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes
+ * @param nBlockFlags bit 0: 1 for first block, 0 otherwise; bit 1: 1 for last block, 0 otherwise
+ */
+void apultra_find_all_matches(apultra_compressor *pCompressor, const int nMatchesPerOffset, const int nStartOffset, const int nEndOffset, const int nBlockFlags);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _MATCHFINDER_H */
diff --git a/tools/z64compress/src/enc/apultra/shrink.c b/tools/z64compress/src/enc/apultra/shrink.c
new file mode 100644
index 000000000..ece2144e8
--- /dev/null
+++ b/tools/z64compress/src/enc/apultra/shrink.c
@@ -0,0 +1,1731 @@
+/*
+ * shrink.c - compressor implementation
+ *
+ * Copyright (C) 2019 Emmanuel Marty
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/*
+ * Uses the libdivsufsort library Copyright (c) 2003-2008 Yuta Mori
+ *
+ * Inspired by cap by Sven-Åke Dahl. https://github.com/svendahl/cap
+ * Also inspired by Charles Bloom's compression blog. http://cbloomrants.blogspot.com/
+ * With ideas from LZ4 by Yann Collet. https://github.com/lz4/lz4
+ * With help and support from spke
+ *
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include "libapultra.h"
+#include "matchfinder.h"
+#include "shrink.h"
+#include "format.h"
+
+#define TOKEN_CODE_LARGE_MATCH 2 /* 10 */
+#define TOKEN_SIZE_LARGE_MATCH 2
+
+#define TOKEN_CODE_7BIT_MATCH 6 /* 110 */
+#define TOKEN_SIZE_7BIT_MATCH 3
+
+#define TOKEN_CODE_4BIT_MATCH 7 /* 111 */
+#define TOKEN_SIZE_4BIT_MATCH 3
+
+#define CountShift(N,bits) if ((N)>>(bits)) { (N)>>=(bits); (n) += (bits); }
+
+/** Gamma2 bit counts for common values, up to 255 */
+static char _gamma2_size[256] = {
+ 0, 0, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+};
+
+/**
+ * Write bitpacked value to output (compressed) buffer
+ *
+ * @param pOutData pointer to output buffer
+ * @param nOutOffset current write index into output buffer
+ * @param nMaxOutDataSize maximum size of output buffer, in bytes
+ * @param nValue value to write
+ * @param nBits number of least significant bits to write in value
+ * @param nCurBitsOffset write index into output buffer, of current byte being filled with bits
+ * @param nCurBitShift bit shift count
+ *
+ * @return updated write index into output buffer, or -1 in case of an error
+ */
+static int apultra_write_bits(unsigned char *pOutData, int nOutOffset, const int nMaxOutDataSize, const int nValue, const int nBits, int *nCurBitsOffset, int *nCurBitShift) {
+ int i;
+
+ if (nOutOffset < 0) return -1;
+
+ for (i = nBits - 1; i >= 0; i--) {
+ if ((*nCurBitsOffset) == INT_MIN) {
+ /* Allocate a new byte in the stream to pack bits in */
+ if (nOutOffset >= nMaxOutDataSize) return -1;
+ (*nCurBitsOffset) = nOutOffset;
+ (*nCurBitShift) = 7;
+ pOutData[nOutOffset++] = 0;
+ }
+
+ pOutData[(*nCurBitsOffset)] |= ((nValue >> i) & 1) << (*nCurBitShift);
+
+ (*nCurBitShift) --;
+ if ((*nCurBitShift) == -1) {
+ /* Current byte is full */
+ (*nCurBitsOffset) = INT_MIN;
+ }
+ }
+
+ return nOutOffset;
+}
+
+/**
+ * Get size of gamma2 encoded value
+ *
+ * @param nValue value of evaluate (2..n)
+ *
+ * @return number of bits required
+ */
+static int apultra_get_gamma2_size(int nValue) {
+ if (nValue >= 0 && nValue < 256)
+ return _gamma2_size[nValue];
+ else {
+ unsigned int n = 0;
+ CountShift(nValue, 16);
+ CountShift(nValue, 8);
+ CountShift(nValue, 4);
+ CountShift(nValue, 2);
+ CountShift(nValue, 1);
+
+ return n << 1;
+ }
+}
+
+/**
+ * Write gamma2 encoded value to output (compressed) buffer
+ *
+ * @param pOutData pointer to output buffer
+ * @param nOutOffset current write index into output buffer
+ * @param nMaxOutDataSize maximum size of output buffer, in bytes
+ * @param nValue value of write (2..n)
+ * @param nCurBitsOffset write index into output buffer, of current byte being filled with bits
+ * @param nCurBitShift bit shift count
+ *
+ * @return updated write index into output buffer, or -1 in case of an error
+ */
+static int apultra_write_gamma2_value(unsigned char *pOutData, int nOutOffset, const int nMaxOutDataSize, int nValue, int *nCurBitsOffset, int *nCurBitShift) {
+ int msb = 30;
+ while ((nValue >> msb--) == 0);
+
+ while (msb > 0) {
+ int bit = (nValue >> msb) & 1;
+
+ nOutOffset = apultra_write_bits(pOutData, nOutOffset, nMaxOutDataSize, bit, 1, nCurBitsOffset, nCurBitShift);
+ msb--;
+ nOutOffset = apultra_write_bits(pOutData, nOutOffset, nMaxOutDataSize, 1, 1, nCurBitsOffset, nCurBitShift);
+ }
+
+ nOutOffset = apultra_write_bits(pOutData, nOutOffset, nMaxOutDataSize, nValue & 1, 1, nCurBitsOffset, nCurBitShift);
+ nOutOffset = apultra_write_bits(pOutData, nOutOffset, nMaxOutDataSize, 0, 1, nCurBitsOffset, nCurBitShift);
+ return nOutOffset;
+}
+
+/**
+ * Get the number of extra bits required to represent a match offset
+ *
+ * @param nLength match length
+ * @param nMatchOffset match offset
+ * @param nFollowsLiteral non-zero if the match follows a literal, zero if it immediately follows another match
+ *
+ * @return number of extra bits required
+ */
+static inline int apultra_get_offset_varlen_size(const int nLength, const int nMatchOffset, const int nFollowsLiteral) {
+ if (nLength <= 3 && nMatchOffset < 128)
+ return 8 + TOKEN_SIZE_7BIT_MATCH;
+ else {
+ if (nFollowsLiteral)
+ return 8 + TOKEN_SIZE_LARGE_MATCH + apultra_get_gamma2_size((nMatchOffset >> 8) + 3);
+ else
+ return 8 + TOKEN_SIZE_LARGE_MATCH + apultra_get_gamma2_size((nMatchOffset >> 8) + 2);
+ }
+}
+
+/**
+ * Get the number of extra bits required to represent a match length
+ *
+ * @param nLength match length
+ * @param nMatchOffset match offset
+ *
+ * @return number of extra bits required
+ */
+static inline int apultra_get_match_varlen_size(int nLength, const int nMatchOffset) {
+ if (nLength <= 3 && nMatchOffset < 128)
+ return 0;
+ else {
+ if (nMatchOffset < 128 || nMatchOffset >= MINMATCH4_OFFSET)
+ return apultra_get_gamma2_size(nLength - 2);
+ else if (nMatchOffset < MINMATCH3_OFFSET)
+ return apultra_get_gamma2_size(nLength);
+ else
+ return apultra_get_gamma2_size(nLength - 1);
+ }
+}
+
+/**
+ * Insert forward rep candidate
+ *
+ * @param pCompressor compression context
+ * @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
+ * @param i input data window position whose matches are being considered
+ * @param nMatchOffset match offset to use as rep candidate
+ * @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
+ * @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes
+ * @param nArrivalsPerPosition maximum number of arrivals per input buffer position
+ * @param nDepth current insertion depth
+ */
+/**
+ * Propagate a match offset forward as a rep-match candidate: for each arrival
+ * recorded at position i that follows a literal, check whether nMatchOffset also
+ * matches the data at that arrival's rep position, and if so record it in the
+ * match table there so the forward optimizer can later consider it as a rep-match.
+ *
+ * @param pCompressor compression context
+ * @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
+ * @param i input data window position the match is being propagated from
+ * @param nMatchOffset offset of the match being propagated
+ * @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
+ * @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes)
+ * @param nArrivalsPerPosition maximum number of arrivals per input buffer position
+ * @param nDepth current recursion depth; recursion stops past depth 9
+ */
+static void apultra_insert_forward_match(apultra_compressor *pCompressor, const unsigned char *pInWindow, const int i, const int nMatchOffset, const int nStartOffset, const int nEndOffset, const int nArrivalsPerPosition, int nDepth) {
+   const apultra_arrival *arrival = pCompressor->arrival + ((i - nStartOffset) * nArrivalsPerPosition);
+   const int *rle_len = (int*)pCompressor->intervals /* reuse */;
+   /* pos_data is reused as two int arrays recording the last offset handled at each
+    * position, so the same (position, offset) pair isn't processed twice */
+   int* visited = ((int*)pCompressor->pos_data) - nStartOffset /* reuse */;
+   int* visited2 = visited + (nEndOffset - nStartOffset) /* reuse */;
+   int j;
+
+   /* Examine every valid arrival recorded at position i */
+   for (j = 0; j < nArrivalsPerPosition && arrival[j].from_slot; j++) {
+      if (arrival[j].follows_literal) {
+         int nRepOffset = arrival[j].rep_offset;
+
+         if (nMatchOffset != nRepOffset && nRepOffset) {
+            int nRepPos = arrival[j].rep_pos;
+
+            if (nRepPos >= nStartOffset &&
+               nRepPos < nEndOffset &&
+               visited[nRepPos] != nMatchOffset) {
+
+               visited[nRepPos] = nMatchOffset;
+
+               /* Only proceed if this offset wasn't already inserted at nRepPos, the
+                * offset is reachable there, and the match slot list isn't full */
+               if (visited2[nRepPos] != nMatchOffset && nRepPos >= nMatchOffset && pCompressor->match[((nRepPos - nStartOffset) << MATCHES_PER_INDEX_SHIFT) + NMATCHES_PER_INDEX - 1].length == 0) {
+                  const unsigned char* pInWindowAtRepOffset = pInWindow + nRepPos;
+
+                  if (pInWindowAtRepOffset[0] == pInWindowAtRepOffset[-nMatchOffset]) {
+                     /* Use the precomputed run lengths to skip a guaranteed-equal prefix */
+                     int nLen0 = rle_len[nRepPos - nMatchOffset];
+                     int nLen1 = rle_len[nRepPos];
+                     int nMinLen = (nLen0 < nLen1) ? nLen0 : nLen1;
+
+                     int nMaxRepLen = nEndOffset - nRepPos;
+                     if (nMaxRepLen > LCP_MAX)
+                        nMaxRepLen = LCP_MAX;
+
+                     if (nMinLen > nMaxRepLen)
+                        nMinLen = nMaxRepLen;
+
+                     const unsigned char* pInWindowMax = pInWindowAtRepOffset + nMaxRepLen;
+                     pInWindowAtRepOffset += nMinLen;
+
+                     /* Extend the match 8, then 4, then 1 byte(s) at a time */
+                     while ((pInWindowAtRepOffset + 8) < pInWindowMax && !memcmp(pInWindowAtRepOffset, pInWindowAtRepOffset - nMatchOffset, 8))
+                        pInWindowAtRepOffset += 8;
+                     while ((pInWindowAtRepOffset + 4) < pInWindowMax && !memcmp(pInWindowAtRepOffset, pInWindowAtRepOffset - nMatchOffset, 4))
+                        pInWindowAtRepOffset += 4;
+                     while (pInWindowAtRepOffset < pInWindowMax && pInWindowAtRepOffset[0] == pInWindowAtRepOffset[-nMatchOffset])
+                        pInWindowAtRepOffset++;
+
+                     int nCurRepLen = (int)(pInWindowAtRepOffset - (pInWindow + nRepPos));
+
+                     if (nCurRepLen >= 2) {
+                        apultra_match* fwd_match = pCompressor->match + ((nRepPos - nStartOffset) << MATCHES_PER_INDEX_SHIFT);
+                        unsigned short* fwd_depth = pCompressor->match_depth + ((nRepPos - nStartOffset) << MATCHES_PER_INDEX_SHIFT);
+                        int r;
+
+                        /* If the offset is already recorded at nRepPos, just grow its length if needed */
+                        for (r = 0; fwd_match[r].length >= MIN_MATCH_SIZE; r++) {
+                           if (fwd_match[r].offset == nMatchOffset && (fwd_depth[r] & 0x3fff) == 0) {
+                              if ((int)fwd_match[r].length < nCurRepLen) {
+                                 fwd_match[r].length = nCurRepLen;
+                                 fwd_depth[r] = 0;
+                              }
+                              r = NMATCHES_PER_INDEX;
+                              break;
+                           }
+                        }
+
+                        if (r < NMATCHES_PER_INDEX) {
+                           visited2[nRepPos] = nMatchOffset;
+
+                           fwd_match[r].offset = nMatchOffset;
+                           fwd_match[r].length = nCurRepLen;
+                           fwd_depth[r] = 0;
+
+                           /* Recurse so the newly inserted candidate is itself propagated,
+                            * up to a bounded depth */
+                           if (nDepth < 9)
+                              apultra_insert_forward_match(pCompressor, pInWindow, nRepPos, nMatchOffset, nStartOffset, nEndOffset, nArrivalsPerPosition, nDepth + 1);
+                        }
+                     }
+                  }
+               }
+            }
+         }
+      }
+   }
+}
+
+/**
+ * Attempt to pick optimal matches, so as to produce the smallest possible output that decompresses to the same input
+ *
+ * @param pCompressor compression context
+ * @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
+ * @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
+ * @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes)
+ * @param nInsertForwardReps non-zero to insert forward repmatch candidates, zero to use the previously inserted candidates
+ * @param nCurRepMatchOffset starting rep offset for this block
+ * @param nBlockFlags bit 0: 1 for first block, 0 otherwise; bit 1: 1 for last block, 0 otherwise
+ * @param nArrivalsPerPosition maximum number of arrivals per input buffer position
+ */
+static void apultra_optimize_forward(apultra_compressor *pCompressor, const unsigned char *pInWindow, const int nStartOffset, const int nEndOffset, const int nInsertForwardReps, const int *nCurRepMatchOffset, const int nBlockFlags, const int nArrivalsPerPosition) {
+   apultra_arrival *arrival = pCompressor->arrival - (nStartOffset * nArrivalsPerPosition);
+   const int* rle_len = (int*)pCompressor->intervals /* reuse */;
+   int* visited = ((int*)pCompressor->pos_data) - nStartOffset /* reuse */;
+   int i, j, n;
+
+   if ((nEndOffset - nStartOffset) > pCompressor->block_size) return;
+
+   /* Reset all arrivals for this block, then seed the starting position: slot 0
+    * carries the incoming rep match offset */
+   memset(arrival + (nStartOffset * nArrivalsPerPosition), 0, sizeof(apultra_arrival) * ((nEndOffset - nStartOffset + 1) * nArrivalsPerPosition));
+
+   arrival[nStartOffset * nArrivalsPerPosition].from_slot = -1;
+   arrival[nStartOffset * nArrivalsPerPosition].rep_offset = *nCurRepMatchOffset;
+
+   /* Seed every arrival slot with a very large cost so any real candidate wins */
+   for (i = (nStartOffset * nArrivalsPerPosition); i != ((nEndOffset+1) * nArrivalsPerPosition); i++) {
+      arrival[i].cost = 0x40000000;
+   }
+
+   if (nInsertForwardReps) {
+      /* Clear the two visited[] arrays used by apultra_insert_forward_match() */
+      memset(visited + nStartOffset, 0, 2 * (nEndOffset - nStartOffset) * sizeof(int));
+   }
+
+   /* Forward pass: extend the arrivals at each position with literal/short-match
+    * and match candidates */
+   for (i = nStartOffset; i != nEndOffset; i++) {
+      apultra_arrival *cur_arrival = &arrival[i * nArrivalsPerPosition];
+      int m;
+
+      const unsigned char nMatch1Offs = pCompressor->match1[i - nStartOffset];
+      int nShortOffset;
+      int nShortLen;
+      int nLiteralScore;
+      int nLiteralCost;
+
+      /* Encode this byte either as a plain literal (1 control bit + 8 data bits), or
+       * as a 4-bit short match when the byte is zero or a 1-byte match offset exists */
+      if ((pInWindow[i] != 0 && nMatch1Offs == 0) || (i == nStartOffset && (nBlockFlags & 1))) {
+         nShortOffset = 0;
+         nShortLen = 0;
+         nLiteralCost = 9 /* literal bit + literal byte */;
+      }
+      else {
+         nShortOffset = (pInWindow[i] == 0) ? 0 : nMatch1Offs;
+         nShortLen = 1;
+         nLiteralCost = 4 + TOKEN_SIZE_4BIT_MATCH /* command and offset cost; no length cost */;
+      }
+
+      nLiteralScore = nShortOffset ? 3 : 1;
+
+      /* Propagate every arrival one position forward with the literal cost added */
+      if (cur_arrival[nArrivalsPerPosition].from_slot) {
+         /* Destination already has arrivals: insert in cost/score order,
+          * de-duplicating by rep offset */
+         for (j = 0; j < nArrivalsPerPosition && cur_arrival[j].from_slot; j++) {
+            int nPrevCost = cur_arrival[j].cost & 0x3fffffff;
+            int nCodingChoiceCost = nPrevCost + nLiteralCost;
+            int nScore = cur_arrival[j].score + nLiteralScore;
+
+            apultra_arrival* pDestSlots = &cur_arrival[nArrivalsPerPosition];
+            if (nCodingChoiceCost < pDestSlots[nArrivalsPerPosition - 1].cost ||
+               (nCodingChoiceCost == pDestSlots[nArrivalsPerPosition - 1].cost && nScore < pDestSlots[nArrivalsPerPosition - 1].score)) {
+               int nRepOffset = cur_arrival[j].rep_offset;
+               int exists = 0;
+
+               /* Skip past strictly cheaper slots; bail if the same rep offset
+                * is already present at a lower cost */
+               for (n = 0;
+                  pDestSlots[n].cost < nCodingChoiceCost;
+                  n++) {
+                  if (pDestSlots[n].rep_offset == nRepOffset) {
+                     exists = 1;
+                     break;
+                  }
+               }
+
+               if (!exists) {
+                  for (;
+                     n < nArrivalsPerPosition && pDestSlots[n].cost == nCodingChoiceCost && nScore >= pDestSlots[n].score;
+                     n++) {
+                     if (pDestSlots[n].rep_offset == nRepOffset) {
+                        exists = 1;
+                        break;
+                     }
+                  }
+
+                  if (!exists) {
+                     if (n < nArrivalsPerPosition) {
+                        int nn;
+
+                        for (nn = n;
+                           nn < nArrivalsPerPosition && pDestSlots[nn].cost == nCodingChoiceCost;
+                           nn++) {
+                           if (pDestSlots[nn].rep_offset == nRepOffset) {
+                              exists = 1;
+                              break;
+                           }
+                        }
+
+                        if (!exists) {
+                           int z;
+
+                           /* Shift worse slots down (up to a same-offset slot) and insert */
+                           for (z = n; z < nArrivalsPerPosition - 1 && pDestSlots[z].from_slot; z++) {
+                              if (pDestSlots[z].rep_offset == nRepOffset)
+                                 break;
+                           }
+
+                           apultra_arrival* pDestArrival = &pDestSlots[n];
+                           memmove(&pDestSlots[n + 1],
+                              &pDestSlots[n],
+                              sizeof(apultra_arrival) * (z - n));
+
+                           pDestArrival->cost = nCodingChoiceCost;
+                           pDestArrival->from_pos = i;
+                           pDestArrival->from_slot = j + 1;
+                           pDestArrival->follows_literal = 1;
+                           pDestArrival->rep_offset = nRepOffset;
+                           pDestArrival->short_offset = nShortOffset;
+                           pDestArrival->rep_pos = cur_arrival[j].rep_pos;
+                           pDestArrival->match_len = nShortLen;
+                           pDestArrival->score = nScore;
+                        }
+                     }
+                  }
+               }
+            }
+         }
+      }
+      else {
+         /* Destination is still empty: copy all arrivals over with the literal cost added */
+         for (j = 0; j < nArrivalsPerPosition && cur_arrival[j].from_slot; j++) {
+            int nPrevCost = cur_arrival[j].cost & 0x3fffffff;
+            int nCodingChoiceCost = nPrevCost + nLiteralCost;
+            int nScore = cur_arrival[j].score + nLiteralScore;
+
+            apultra_arrival* pDestArrival = &cur_arrival[nArrivalsPerPosition + j];
+
+            pDestArrival->cost = nCodingChoiceCost;
+            pDestArrival->from_pos = i;
+            pDestArrival->from_slot = j + 1;
+            pDestArrival->follows_literal = 1;
+            pDestArrival->rep_offset = cur_arrival[j].rep_offset;
+            pDestArrival->short_offset = nShortOffset;
+            pDestArrival->rep_pos = cur_arrival[j].rep_pos;
+            pDestArrival->match_len = nShortLen;
+            pDestArrival->score = nScore;
+         }
+      }
+
+      /* The first byte of the first block is always stored verbatim (see the block
+       * writer), so no matches are considered at it */
+      if (i == nStartOffset && (nBlockFlags & 1)) continue;
+
+      const apultra_match *match = pCompressor->match + ((i - nStartOffset) << MATCHES_PER_INDEX_SHIFT);
+      const unsigned short *match_depth = pCompressor->match_depth + ((i - nStartOffset) << MATCHES_PER_INDEX_SHIFT);
+      int nNumArrivalsForThisPos = j, nOverallMinRepLen = 0, nOverallMaxRepLen = 0;
+
+      int nRepLenForArrival[NARRIVALS_PER_POSITION_MAX];
+      memset(nRepLenForArrival, 0, nArrivalsPerPosition * sizeof(int));
+
+      int nRepMatchArrivalIdx[NARRIVALS_PER_POSITION_MAX + 1];
+      int nNumRepMatchArrivals = 0;
+
+      int nMaxRepLenForPos = nEndOffset - i;
+      if (nMaxRepLenForPos > LCP_MAX)
+         nMaxRepLenForPos = LCP_MAX;
+      const unsigned char* pInWindowStart = pInWindow + i;
+      const unsigned char* pInWindowMax = pInWindowStart + nMaxRepLenForPos;
+      const int nLen1 = rle_len[i];
+
+      /* For each arrival that follows a literal, measure how far its rep offset
+       * matches at this position; remember the arrivals that can rep-match */
+      for (j = 0; j < nNumArrivalsForThisPos && (i + 2) <= nEndOffset; j++) {
+         if (cur_arrival[j].follows_literal) {
+            int nRepOffset = cur_arrival[j].rep_offset;
+
+            if (nRepOffset && i >= nRepOffset) {
+               if (pInWindowStart[0] == pInWindowStart[-nRepOffset]) {
+                  int nLen0 = rle_len[i - nRepOffset];
+                  int nMinLen = (nLen0 < nLen1) ? nLen0 : nLen1;
+
+                  if (nMinLen > nMaxRepLenForPos)
+                     nMinLen = nMaxRepLenForPos;
+
+                  /* Extend the rep-match 8, then 4, then 1 byte(s) at a time */
+                  const unsigned char* pInWindowAtRepOffset = pInWindowStart + nMinLen;
+                  while ((pInWindowAtRepOffset + 8) < pInWindowMax && !memcmp(pInWindowAtRepOffset, pInWindowAtRepOffset - nRepOffset, 8))
+                     pInWindowAtRepOffset += 8;
+                  while ((pInWindowAtRepOffset + 4) < pInWindowMax && !memcmp(pInWindowAtRepOffset, pInWindowAtRepOffset - nRepOffset, 4))
+                     pInWindowAtRepOffset += 4;
+                  while (pInWindowAtRepOffset < pInWindowMax && pInWindowAtRepOffset[0] == pInWindowAtRepOffset[-nRepOffset])
+                     pInWindowAtRepOffset++;
+
+                  int nCurMaxLen = (int)(pInWindowAtRepOffset - pInWindowStart);
+
+                  if (nCurMaxLen >= 2) {
+                     nRepLenForArrival[j] = nCurMaxLen;
+                     nRepMatchArrivalIdx[nNumRepMatchArrivals++] = j;
+
+                     if (nOverallMaxRepLen < nCurMaxLen)
+                        nOverallMaxRepLen = nCurMaxLen;
+                  }
+               }
+            }
+         }
+      }
+      nRepMatchArrivalIdx[nNumRepMatchArrivals] = -1;
+
+      /* Evaluate every match candidate recorded for this position */
+      for (m = 0; m < NMATCHES_PER_INDEX && match[m].length; m++) {
+         const int nOrigMatchLen = match[m].length;
+         const int nOrigMatchOffset = match[m].offset;
+         const unsigned int nOrigMatchDepth = match_depth[m] & 0x3fff;
+         const int nScorePenalty = 3 + ((match_depth[m] & 0x8000) >> 15);
+         unsigned int d;
+
+         /* Try the match as found (d == 0) and, when it has a non-zero depth, also
+          * the version with the depth subtracted from its offset and length */
+         for (d = 0; d <= nOrigMatchDepth; d += (nOrigMatchDepth ? nOrigMatchDepth : 1)) {
+            const int nMatchOffset = nOrigMatchOffset - d;
+            int nMatchLen = nOrigMatchLen - d;
+
+            if ((i + nMatchLen) > nEndOffset)
+               nMatchLen = nEndOffset - i;
+
+            if (nInsertForwardReps) {
+               apultra_insert_forward_match(pCompressor, pInWindow, i, nMatchOffset, nStartOffset, nEndOffset, nArrivalsPerPosition, 0);
+            }
+
+            if (nMatchLen >= 2) {
+               int nStartingMatchLen, nJumpMatchLen, k;
+               int nNoRepMatchOffsetCostForLit[2], nNoRepMatchOffsetCostDelta;
+               int nMinMatchLenForOffset;
+               int nNoRepCostAdjusment = (nMatchLen >= LCP_MAX) ? 1 : 0;
+
+               /* Larger offsets need longer minimum match lengths to be worth encoding */
+               if (nMatchOffset < MINMATCH3_OFFSET)
+                  nMinMatchLenForOffset = 2;
+               else {
+                  if (nMatchOffset < MINMATCH4_OFFSET)
+                     nMinMatchLenForOffset = 3;
+                  else
+                     nMinMatchLenForOffset = 4;
+               }
+
+               if (nMatchLen >= LEAVE_ALONE_MATCH_SIZE && i >= nMatchLen)
+                  nStartingMatchLen = nMatchLen;
+               else
+                  nStartingMatchLen = 2;
+
+               /* In the single-block case, long matches skip the lengths between 90
+                * and the longest (see the jump at the bottom of the k loop) */
+               if ((nBlockFlags & 3) == 3 && nMatchLen > 90 && i >= 90)
+                  nJumpMatchLen = 90;
+               else
+                  nJumpMatchLen = nMatchLen + 1;
+
+               /* Offset cost differs with whether the command follows a literal
+                * (gamma base +2 vs +3) */
+               if (nStartingMatchLen <= 3 && nMatchOffset < 128) {
+                  nNoRepMatchOffsetCostForLit[0] = 8 + TOKEN_SIZE_7BIT_MATCH;
+                  nNoRepMatchOffsetCostForLit[1] = 8 + TOKEN_SIZE_7BIT_MATCH;
+               }
+               else {
+                  nNoRepMatchOffsetCostForLit[0] = 8 + TOKEN_SIZE_LARGE_MATCH + apultra_get_gamma2_size((nMatchOffset >> 8) + 2);
+                  nNoRepMatchOffsetCostForLit[1] = 8 + TOKEN_SIZE_LARGE_MATCH + apultra_get_gamma2_size((nMatchOffset >> 8) + 3);
+               }
+               nNoRepMatchOffsetCostDelta = nNoRepMatchOffsetCostForLit[1] - nNoRepMatchOffsetCostForLit[0];
+
+               /* Insert candidates for every encodable length k of this match */
+               for (k = nStartingMatchLen; k <= nMatchLen; k++) {
+                  int nRepMatchMatchLenCost = apultra_get_gamma2_size(k);
+                  apultra_arrival *pDestSlots = &cur_arrival[k * nArrivalsPerPosition];
+
+                  /* Insert non-repmatch candidate */
+
+                  if (k >= nMinMatchLenForOffset) {
+                     int nNoRepMatchMatchLenCost;
+
+                     if (k <= 3 && nMatchOffset < 128)
+                        nNoRepMatchMatchLenCost = 0;
+                     else {
+                        if (nMatchOffset < 128 || nMatchOffset >= MINMATCH4_OFFSET)
+                           nNoRepMatchMatchLenCost = apultra_get_gamma2_size(k - 2);
+                        else if (nMatchOffset < MINMATCH3_OFFSET)
+                           nNoRepMatchMatchLenCost = nRepMatchMatchLenCost;
+                        else
+                           nNoRepMatchMatchLenCost = apultra_get_gamma2_size(k - 1);
+                     }
+
+                     for (j = 0; j < nNumArrivalsForThisPos; j++) {
+                        if (nMatchOffset != cur_arrival[j].rep_offset || cur_arrival[j].follows_literal == 0) {
+                           int nPrevCost = cur_arrival[j].cost & 0x3fffffff;
+                           int nMatchCmdCost = nNoRepMatchMatchLenCost + nNoRepMatchOffsetCostForLit[cur_arrival[j].follows_literal];
+                           int nCodingChoiceCost = nPrevCost + nMatchCmdCost;
+
+                           if (nCodingChoiceCost <= (pDestSlots[nArrivalsPerPosition - 1].cost + 1)) {
+                              int nScore = cur_arrival[j].score + nScorePenalty;
+
+                              if (nCodingChoiceCost < pDestSlots[nArrivalsPerPosition - 2].cost ||
+                                 (nCodingChoiceCost == pDestSlots[nArrivalsPerPosition - 2].cost && nScore < pDestSlots[nArrivalsPerPosition - 2].score)) {
+                                 int exists = 0;
+
+                                 for (n = 0;
+                                    pDestSlots[n].cost < nCodingChoiceCost;
+                                    n++) {
+                                    if (pDestSlots[n].rep_offset == nMatchOffset) {
+                                       exists = 1;
+                                       break;
+                                    }
+                                 }
+
+                                 if (!exists) {
+                                    int nRevisedCodingChoiceCost = nCodingChoiceCost - nNoRepCostAdjusment;
+
+                                    for (;
+                                       n < nArrivalsPerPosition - 1 && pDestSlots[n].cost == nRevisedCodingChoiceCost && nScore >= pDestSlots[n].score;
+                                       n++) {
+                                       if (pDestSlots[n].rep_offset == nMatchOffset) {
+                                          exists = 1;
+                                          break;
+                                       }
+                                    }
+
+                                    if (!exists) {
+                                       if (n < nArrivalsPerPosition - 1) {
+                                          int nn;
+
+                                          for (nn = n;
+                                             nn < nArrivalsPerPosition && pDestSlots[nn].cost == nCodingChoiceCost;
+                                             nn++) {
+                                             if (pDestSlots[nn].rep_offset == nMatchOffset) {
+                                                exists = 1;
+                                                break;
+                                             }
+                                          }
+
+                                          if (!exists) {
+                                             int z;
+
+                                             for (z = n; z < nArrivalsPerPosition - 1 && pDestSlots[z].from_slot; z++) {
+                                                if (pDestSlots[z].rep_offset == nMatchOffset)
+                                                   break;
+                                             }
+
+                                             apultra_arrival* pDestArrival = &pDestSlots[n];
+                                             memmove(&pDestSlots[n + 1],
+                                                &pDestSlots[n],
+                                                sizeof(apultra_arrival) * (z - n));
+
+                                             pDestArrival->cost = nRevisedCodingChoiceCost;
+                                             pDestArrival->from_pos = i;
+                                             pDestArrival->from_slot = j + 1;
+                                             pDestArrival->follows_literal = 0;
+                                             pDestArrival->rep_offset = nMatchOffset;
+                                             pDestArrival->short_offset = 0;
+                                             pDestArrival->rep_pos = i;
+                                             pDestArrival->match_len = k;
+                                             pDestArrival->score = nScore;
+                                          }
+                                       }
+                                    }
+                                 }
+                                 else {
+                                    if ((nCodingChoiceCost - pDestSlots[n].cost) >= nNoRepMatchOffsetCostDelta)
+                                       break;
+                                 }
+                              }
+                              if (cur_arrival[j].follows_literal == 0 || nNoRepMatchOffsetCostDelta == 0)
+                                 break;
+                           }
+                           else {
+                              break;
+                           }
+                        }
+                     }
+                  }
+
+                  /* Insert repmatch candidate */
+
+                  if (k > nOverallMinRepLen && k <= nOverallMaxRepLen) {
+                     int nRepMatchCmdCost = TOKEN_SIZE_LARGE_MATCH + 2 /* apultra_get_gamma2_size(2) */ + nRepMatchMatchLenCost;
+                     int nCurRepMatchArrival;
+
+                     if (k <= 90)
+                        nOverallMinRepLen = k;
+                     else if (nOverallMaxRepLen == k)
+                        nOverallMaxRepLen--;
+
+                     for (nCurRepMatchArrival = 0; (j = nRepMatchArrivalIdx[nCurRepMatchArrival]) >= 0; nCurRepMatchArrival++) {
+                        if (nRepLenForArrival[j] >= k) {
+                           int nPrevCost = cur_arrival[j].cost & 0x3fffffff;
+                           int nRepCodingChoiceCost = nPrevCost + nRepMatchCmdCost;
+                           int nScore = cur_arrival[j].score + 2;
+
+                           if (nRepCodingChoiceCost < pDestSlots[nArrivalsPerPosition - 1].cost ||
+                              (nRepCodingChoiceCost == pDestSlots[nArrivalsPerPosition - 1].cost && nScore < pDestSlots[nArrivalsPerPosition - 1].score)) {
+                              int nRepOffset = cur_arrival[j].rep_offset;
+                              int exists = 0;
+
+                              for (n = 0;
+                                 pDestSlots[n].cost < nRepCodingChoiceCost;
+                                 n++) {
+                                 if (pDestSlots[n].rep_offset == nRepOffset) {
+                                    exists = 1;
+                                    break;
+                                 }
+                              }
+
+                              if (!exists) {
+                                 for (;
+                                    n < nArrivalsPerPosition && pDestSlots[n].cost == nRepCodingChoiceCost && nScore >= pDestSlots[n].score;
+                                    n++) {
+                                    if (pDestSlots[n].rep_offset == nRepOffset) {
+                                       exists = 1;
+                                       break;
+                                    }
+                                 }
+
+                                 if (!exists) {
+                                    if (n < nArrivalsPerPosition) {
+                                       int nn;
+
+                                       for (nn = n;
+                                          nn < nArrivalsPerPosition && pDestSlots[nn].cost == nRepCodingChoiceCost;
+                                          nn++) {
+                                          if (pDestSlots[nn].rep_offset == nRepOffset) {
+                                             exists = 1;
+                                             break;
+                                          }
+                                       }
+
+                                       if (!exists) {
+                                          int z;
+
+                                          for (z = n; z < nArrivalsPerPosition - 1 && pDestSlots[z].from_slot; z++) {
+                                             if (pDestSlots[z].rep_offset == nRepOffset)
+                                                break;
+                                          }
+
+                                          apultra_arrival* pDestArrival = &pDestSlots[n];
+                                          memmove(&pDestSlots[n + 1],
+                                             &pDestSlots[n],
+                                             sizeof(apultra_arrival) * (z - n));
+
+                                          pDestArrival->cost = nRepCodingChoiceCost;
+                                          pDestArrival->from_pos = i;
+                                          pDestArrival->from_slot = j + 1;
+                                          pDestArrival->follows_literal = 0;
+                                          pDestArrival->rep_offset = nRepOffset;
+                                          pDestArrival->short_offset = 0;
+                                          pDestArrival->rep_pos = i;
+                                          pDestArrival->match_len = k;
+                                          pDestArrival->score = nScore;
+                                       }
+                                    }
+                                 }
+                              }
+                           }
+                        }
+                        else {
+                           break;
+                        }
+                     }
+                  }
+
+                  if (k == 3 && nMatchOffset < 128) {
+                     nNoRepMatchOffsetCostForLit[0] = 8 + TOKEN_SIZE_LARGE_MATCH + 2 /* apultra_get_gamma2_size((nMatchOffset >> 8) + 2) */;
+                     nNoRepMatchOffsetCostForLit[1] = 8 + TOKEN_SIZE_LARGE_MATCH + 2 /* apultra_get_gamma2_size((nMatchOffset >> 8) + 3) */;
+                  }
+
+                  if (k == nJumpMatchLen)
+                     k = nMatchLen - 1;
+               }
+            }
+         }
+
+         if (nOrigMatchLen >= 512)
+            break;
+      }
+   }
+
+   if (!nInsertForwardReps) {
+      /* Second pass only: walk the cheapest arrival chain backwards from the end
+       * of the block and store the chosen matches */
+      const apultra_arrival* end_arrival = &arrival[(i * nArrivalsPerPosition) + 0];
+      apultra_final_match* pBestMatch = pCompressor->best_match - nStartOffset;
+
+      while (end_arrival->from_slot > 0 && end_arrival->from_pos >= 0 && (int)end_arrival->from_pos < nEndOffset) {
+         pBestMatch[end_arrival->from_pos].length = end_arrival->match_len;
+         if (end_arrival->match_len >= 2)
+            pBestMatch[end_arrival->from_pos].offset = end_arrival->rep_offset;
+         else
+            pBestMatch[end_arrival->from_pos].offset = end_arrival->short_offset;
+
+         end_arrival = &arrival[(end_arrival->from_pos * nArrivalsPerPosition) + (end_arrival->from_slot - 1)];
+      }
+   }
+}
+
+/**
+ * Attempt to replace matches by literals when it makes the final bitstream smaller, and merge large matches
+ *
+ * @param pCompressor compression context
+ * @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
+ * @param pBestMatch optimal matches to evaluate and update
+ * @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
+ * @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes)
+ * @param nCurRepMatchOffset starting rep offset for this block
+ * @param nBlockFlags bit 0: 1 for first block, 0 otherwise; bit 1: 1 for last block, 0 otherwise
+ *
+ * @return non-zero if the number of tokens was reduced, 0 if it wasn't
+ */
+static int apultra_reduce_commands(apultra_compressor *pCompressor, const unsigned char *pInWindow, apultra_final_match *pBestMatch, const int nStartOffset, const int nEndOffset, const int *nCurRepMatchOffset, const int nBlockFlags) {
+   int i;
+   int nRepMatchOffset = *nCurRepMatchOffset;
+   int nFollowsLiteral = 0;
+   int nDidReduce = 0;
+   int nLastMatchLen = 0;
+   const unsigned char *match1 = pCompressor->match1 - nStartOffset;
+
+   for (i = nStartOffset + ((nBlockFlags & 1) ? 1 : 0); i < nEndOffset; ) {
+      apultra_final_match *pMatch = pBestMatch + i;
+
+      /* If this position holds a literal/1-byte match and the following match's
+       * bytes also match one byte earlier, try extending that match left by one */
+      if (pMatch->length <= 1 &&
+         (i + 1) < nEndOffset &&
+         pBestMatch[i + 1].length >= 2 &&
+         pBestMatch[i + 1].length < MAX_VARLEN &&
+         pBestMatch[i + 1].offset &&
+         i >= pBestMatch[i + 1].offset &&
+         (i + pBestMatch[i + 1].length + 1) <= nEndOffset &&
+         !memcmp(pInWindow + i - (pBestMatch[i + 1].offset), pInWindow + i, pBestMatch[i + 1].length + 1)) {
+         if ((pBestMatch[i + 1].offset < MINMATCH3_OFFSET || (pBestMatch[i + 1].length + 1) >= 3 || (pBestMatch[i + 1].offset == nRepMatchOffset && nFollowsLiteral)) &&
+            (pBestMatch[i + 1].offset < MINMATCH4_OFFSET || (pBestMatch[i + 1].length + 1) >= 4 || (pBestMatch[i + 1].offset == nRepMatchOffset && nFollowsLiteral))) {
+
+            /* Cost of the current pair: the literal/4-bit match plus the following match */
+            int nCurPartialCommandSize = (pMatch->length == 1) ? (TOKEN_SIZE_4BIT_MATCH + 4) : (1 /* literal bit */ + 8 /* literal size */);
+            if (pBestMatch[i + 1].offset == nRepMatchOffset /* always follows a literal, the one at the current position */) {
+               nCurPartialCommandSize += TOKEN_SIZE_LARGE_MATCH + 2 /* apultra_get_gamma2_size(2) */ + apultra_get_gamma2_size(pBestMatch[i + 1].length);
+            }
+            else {
+               nCurPartialCommandSize += apultra_get_offset_varlen_size(pBestMatch[i + 1].length, pBestMatch[i + 1].offset, 1) + apultra_get_match_varlen_size(pBestMatch[i + 1].length, pBestMatch[i + 1].offset);
+            }
+
+            /* Cost of the single merged, one-byte-longer match */
+            int nReducedPartialCommandSize;
+            if (pBestMatch[i + 1].offset == nRepMatchOffset && nFollowsLiteral) {
+               nReducedPartialCommandSize = TOKEN_SIZE_LARGE_MATCH + 2 /* apultra_get_gamma2_size(2) */ + apultra_get_gamma2_size(pBestMatch[i + 1].length);
+            }
+            else {
+               nReducedPartialCommandSize = apultra_get_offset_varlen_size(pBestMatch[i + 1].length, pBestMatch[i + 1].offset, nFollowsLiteral) + apultra_get_match_varlen_size(pBestMatch[i + 1].length, pBestMatch[i + 1].offset);
+            }
+
+            if (nReducedPartialCommandSize < nCurPartialCommandSize || (nFollowsLiteral == 0 && nLastMatchLen >= LCP_MAX)) {
+               /* Merge */
+               pBestMatch[i].length = pBestMatch[i + 1].length + 1;
+               pBestMatch[i].offset = pBestMatch[i + 1].offset;
+               pBestMatch[i + 1].length = 0;
+               pBestMatch[i + 1].offset = 0;
+               nDidReduce = 1;
+               continue;
+            }
+         }
+      }
+
+      if (pMatch->length >= 2) {
+         if (pMatch->length < 32 && /* Don't waste time considering large matches, they will always win over literals */
+            (i + pMatch->length) < nEndOffset /* Don't consider the last match in the block, we can only reduce a match inbetween other tokens */) {
+            int nNextIndex = i + pMatch->length;
+            int nNextFollowsLiteral = 0;
+            int nCannotEncode = 0;
+
+            /* Find the next match command, noting whether literals precede it */
+            while (nNextIndex < nEndOffset && pBestMatch[nNextIndex].length < 2) {
+               nNextIndex++;
+               nNextFollowsLiteral = 1;
+            }
+
+            if (nNextIndex < nEndOffset && pBestMatch[nNextIndex].length >= 2) {
+               if (nRepMatchOffset && nRepMatchOffset != pMatch->offset && pBestMatch[nNextIndex].offset && pMatch->offset != pBestMatch[nNextIndex].offset &&
+                  nNextFollowsLiteral) {
+                  /* Try to gain a match forward */
+                  if (i >= pBestMatch[nNextIndex].offset && (i - pBestMatch[nNextIndex].offset + pMatch->length) <= nEndOffset) {
+                     if ((pBestMatch[nNextIndex].offset < MINMATCH3_OFFSET || pMatch->length >= 3) &&
+                        (pBestMatch[nNextIndex].offset < MINMATCH4_OFFSET || pMatch->length >= 4)) {
+                        int nMaxLen = 0;
+                        const unsigned char* pInWindowAtPos = pInWindow + i;
+                        while (nMaxLen < pMatch->length && pInWindowAtPos[nMaxLen - pBestMatch[nNextIndex].offset] == pInWindowAtPos[nMaxLen])
+                           nMaxLen++;
+
+                        if (nMaxLen >= pMatch->length) {
+                           /* Replace */
+                           pMatch->offset = pBestMatch[nNextIndex].offset;
+                           nDidReduce = 1;
+                        }
+                        else if (nMaxLen >= 2) {
+                           if ((nFollowsLiteral && nRepMatchOffset == pBestMatch[nNextIndex].offset) ||
+                              ((pBestMatch[nNextIndex].offset < MINMATCH3_OFFSET || nMaxLen >= 3) &&
+                                 (pBestMatch[nNextIndex].offset < MINMATCH4_OFFSET || nMaxLen >= 4))) {
+
+                              int nPartialSizeBefore, nPartialSizeAfter, j;
+
+                              nPartialSizeBefore = apultra_get_offset_varlen_size(pMatch->length, pMatch->offset, nFollowsLiteral);
+                              nPartialSizeBefore += apultra_get_match_varlen_size(pMatch->length, pMatch->offset);
+
+                              nPartialSizeBefore += apultra_get_offset_varlen_size(pBestMatch[nNextIndex].length, pBestMatch[nNextIndex].offset, 1);
+                              nPartialSizeBefore += apultra_get_match_varlen_size(pBestMatch[nNextIndex].length, pBestMatch[nNextIndex].offset);
+
+                              nPartialSizeAfter = apultra_get_offset_varlen_size(nMaxLen, pBestMatch[nNextIndex].offset, nFollowsLiteral);
+                              if (nFollowsLiteral && nRepMatchOffset == pBestMatch[nNextIndex].offset)
+                                 nPartialSizeAfter += apultra_get_gamma2_size(nMaxLen);
+                              else
+                                 nPartialSizeAfter += apultra_get_match_varlen_size(nMaxLen, pBestMatch[nNextIndex].offset);
+
+                              nPartialSizeAfter += TOKEN_SIZE_LARGE_MATCH + 2 /* apultra_get_gamma2_size(2) */;
+                              nPartialSizeAfter += apultra_get_gamma2_size(pBestMatch[nNextIndex].length);
+
+                              for (j = nMaxLen; j < pMatch->length; j++) {
+                                 if (pInWindow[i + j] == 0 || match1[i + j])
+                                    nPartialSizeAfter += TOKEN_SIZE_4BIT_MATCH + 4;
+                                 else
+                                    nPartialSizeAfter += 1 /* literal bit */ + 8 /* literal byte */;
+                              }
+
+                              if (nPartialSizeAfter < nPartialSizeBefore) {
+                                 /* We gain a repmatch that is shorter than the original match as this is the best we can do, so it is followed by extra literals, but
+                                  * we have calculated that this is shorter */
+
+                                 int nOrigLen = pMatch->length;
+                                 int j;
+
+                                 pMatch->offset = pBestMatch[nNextIndex].offset;
+                                 pMatch->length = nMaxLen;
+
+                                 for (j = nMaxLen; j < nOrigLen; j++) {
+                                    pBestMatch[i + j].offset = match1[i + j];
+                                    pBestMatch[i + j].length = (pInWindow[i + j] && match1[i+j] == 0) ? 0 : 1;
+                                 }
+
+                                 nDidReduce = 1;
+                                 continue;
+                              }
+                           }
+                        }
+                     }
+                  }
+               }
+
+               /* Calculate this command's current cost */
+
+               int nCurCommandSize;
+               if (pMatch->offset == nRepMatchOffset && nFollowsLiteral) {
+                  nCurCommandSize = TOKEN_SIZE_LARGE_MATCH + 2 /* apultra_get_gamma2_size(2) */ + apultra_get_gamma2_size(pMatch->length);
+               }
+               else {
+                  nCurCommandSize = apultra_get_offset_varlen_size(pMatch->length, pMatch->offset, nFollowsLiteral) + apultra_get_match_varlen_size(pMatch->length, pMatch->offset);
+               }
+
+               /* Calculate the next command's current cost */
+               int nNextCommandSize;
+               if (pBestMatch[nNextIndex].offset == pMatch->offset && nNextFollowsLiteral && pBestMatch[nNextIndex].length >= 2) {
+                  nNextCommandSize = TOKEN_SIZE_LARGE_MATCH + 2 /* apultra_get_gamma2_size(2) */ + apultra_get_gamma2_size(pBestMatch[nNextIndex].length);
+               }
+               else {
+                  nNextCommandSize = apultra_get_offset_varlen_size(pBestMatch[nNextIndex].length, pBestMatch[nNextIndex].offset, nNextFollowsLiteral) + apultra_get_match_varlen_size(pBestMatch[nNextIndex].length, pBestMatch[nNextIndex].offset);
+               }
+
+               int nOriginalCombinedCommandSize = nCurCommandSize + nNextCommandSize;
+
+               /* Calculate the cost of replacing this match command by literals + the effect on the cost of the next command */
+               int nReducedCommandSize = 0;
+               int j;
+
+               for (j = 0; j < pMatch->length; j++) {
+                  if (pInWindow[i + j] == 0 || match1[i + j])
+                     nReducedCommandSize += TOKEN_SIZE_4BIT_MATCH + 4;
+                  else
+                     nReducedCommandSize += 1 /* literal bit */ + 8;
+               }
+
+               if (pBestMatch[nNextIndex].offset == nRepMatchOffset /* the new command would always follow literals, the ones we create */ && pBestMatch[nNextIndex].length >= 2) {
+                  nReducedCommandSize += TOKEN_SIZE_LARGE_MATCH + 2 /* apultra_get_gamma2_size(2) */ + apultra_get_gamma2_size(pBestMatch[nNextIndex].length);
+               }
+               else {
+                  if ((pBestMatch[nNextIndex].length < 3 && pBestMatch[nNextIndex].offset >= MINMATCH3_OFFSET) ||
+                     (pBestMatch[nNextIndex].length < 4 && pBestMatch[nNextIndex].offset >= MINMATCH4_OFFSET)) {
+                     /* This match length can only be encoded with a rep-match */
+                     nCannotEncode = 1;
+                  }
+                  else {
+                     nReducedCommandSize += apultra_get_offset_varlen_size(pBestMatch[nNextIndex].length, pBestMatch[nNextIndex].offset, 1 /* follows literals */) + apultra_get_match_varlen_size(pBestMatch[nNextIndex].length, pBestMatch[nNextIndex].offset);
+               }
+               }
+
+               if (!nCannotEncode && nOriginalCombinedCommandSize > nReducedCommandSize) {
+                  /* Reduce */
+                  int nMatchLen = pMatch->length;
+                  int j;
+
+                  for (j = 0; j < nMatchLen; j++) {
+                     pBestMatch[i + j].offset = match1[i + j];
+                     pBestMatch[i + j].length = (pInWindow[i + j] && match1[i + j] == 0) ? 0 : 1;
+                  }
+
+                  nDidReduce = 1;
+                  continue;
+               }
+            }
+         }
+
+         if ((i + pMatch->length) < nEndOffset && pMatch->offset > 0 &&
+            pBestMatch[i + pMatch->length].offset > 0 &&
+            pBestMatch[i + pMatch->length].length >= 2 &&
+            (pMatch->length + pBestMatch[i + pMatch->length].length) >= LEAVE_ALONE_MATCH_SIZE &&
+            (pMatch->length + pBestMatch[i + pMatch->length].length) <= MAX_VARLEN &&
+            (i + pMatch->length) >= pMatch->offset &&
+            (i + pMatch->length) >= pBestMatch[i + pMatch->length].offset &&
+            (i + pMatch->length + pBestMatch[i + pMatch->length].length) <= nEndOffset &&
+            !memcmp(pInWindow + i + pMatch->length - pMatch->offset,
+               pInWindow + i + pMatch->length - pBestMatch[i + pMatch->length].offset,
+               pBestMatch[i + pMatch->length].length)) {
+            int nMatchLen = pMatch->length;
+
+            /* Join large matches */
+
+            int nNextIndex = i + pMatch->length + pBestMatch[i + pMatch->length].length;
+            int nNextFollowsLiteral = 0;
+            int nCannotEncode = 0;
+
+            while (nNextIndex < nEndOffset && pBestMatch[nNextIndex].length < 2) {
+               nNextIndex++;
+               nNextFollowsLiteral = 1;
+            }
+
+            /* Don't join if the following command relies on the rep offset the
+             * absorbed match would have established */
+            if (nNextIndex < nEndOffset && nNextFollowsLiteral && pBestMatch[nNextIndex].length >= 2 &&
+               pBestMatch[nNextIndex].offset == pBestMatch[i + pMatch->length].offset) {
+               if ((pBestMatch[nNextIndex].offset >= MINMATCH3_OFFSET && pBestMatch[nNextIndex].length < 3) ||
+                  (pBestMatch[nNextIndex].offset >= MINMATCH4_OFFSET && pBestMatch[nNextIndex].length < 4)) {
+                  nCannotEncode = 1;
+               }
+            }
+
+            if (!nCannotEncode) {
+               pMatch->length += pBestMatch[i + nMatchLen].length;
+               pBestMatch[i + nMatchLen].offset = 0;
+               pBestMatch[i + nMatchLen].length = -1;
+               nDidReduce = 1;
+               continue;
+            }
+         }
+
+         /* Keep this match: it becomes the new rep offset */
+         nRepMatchOffset = pMatch->offset;
+         nFollowsLiteral = 0;
+         nLastMatchLen = pMatch->length;
+
+         i += pMatch->length;
+      }
+      else {
+         /* 4 bits offset (1 byte match) or literal */
+         i++;
+         nFollowsLiteral = 1;
+         nLastMatchLen = 0;
+      }
+   }
+
+   return nDidReduce;
+}
+
+/**
+ * Emit a block of compressed data
+ *
+ * @param pCompressor compression context
+ * @param pBestMatch optimal matches to emit
+ * @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
+ * @param nStartOffset current offset in input window (typically the number of previously compressed bytes)
+ * @param nEndOffset offset to end finding matches at (typically the size of the total input window in bytes)
+ * @param pOutData pointer to output buffer
+ * @param nMaxOutDataSize maximum size of output buffer, in bytes
+ * @param nCurBitsOffset write index into output buffer, of current byte being filled with bits
+ * @param nCurBitShift bit shift count
+ * @param nFollowsLiteral non-zero if the next command to be issued follows a literal, 0 if not
+ * @param nCurRepMatchOffset starting rep offset for this block, updated after the block is compressed successfully
+ * @param nBlockFlags bit 0: 1 for first block, 0 otherwise; bit 1: 1 for last block, 0 otherwise
+ *
+ * @return size of compressed data in output buffer, or -1 if the data is uncompressible
+ */
+static int apultra_write_block(apultra_compressor *pCompressor, apultra_final_match *pBestMatch, const unsigned char *pInWindow, const int nStartOffset, const int nEndOffset, unsigned char *pOutData, int nOutOffset, const int nMaxOutDataSize, int *nCurBitsOffset, int *nCurBitShift, int *nFollowsLiteral, int *nCurRepMatchOffset, const int nBlockFlags) {
+ int i;
+ int nRepMatchOffset = *nCurRepMatchOffset;
+ const int nMaxOffset = pCompressor->max_offset;
+
+ if (nBlockFlags & 1) {
+ if (nOutOffset < 0 || nOutOffset >= nMaxOutDataSize)
+ return -1;
+ pOutData[nOutOffset++] = pInWindow[nStartOffset];
+ *nFollowsLiteral = 1;
+ }
+
+ for (i = nStartOffset + ((nBlockFlags & 1) ? 1 : 0); i < nEndOffset; ) {
+ const apultra_final_match *pMatch = pBestMatch + i;
+
+ if (pMatch->length >= 2) {
+ int nMatchOffset = pMatch->offset;
+ int nMatchLen = pMatch->length;
+
+ if (nMatchOffset < MIN_OFFSET || nMatchOffset > nMaxOffset)
+ return -1;
+
+ if (nMatchOffset == nRepMatchOffset && *nFollowsLiteral) {
+ /* Rep-match */
+ nOutOffset = apultra_write_bits(pOutData, nOutOffset, nMaxOutDataSize, TOKEN_CODE_LARGE_MATCH, TOKEN_SIZE_LARGE_MATCH, nCurBitsOffset, nCurBitShift);
+ nOutOffset = apultra_write_bits(pOutData, nOutOffset, nMaxOutDataSize, 0 /* length of 2 encoded as gamma 2 */, 2, nCurBitsOffset, nCurBitShift);
+
+ /* The match length isn't encoded in the command, emit elias gamma value */
+ nOutOffset = apultra_write_gamma2_value(pOutData, nOutOffset, nMaxOutDataSize, nMatchLen, nCurBitsOffset, nCurBitShift);
+ if (nOutOffset < 0) return -1;
+
+ *nFollowsLiteral = 0;
+
+ pCompressor->stats.num_rep_matches++;
+ }
+ else {
+ if (nMatchLen <= 3 && nMatchOffset < 128) {
+ /* 7 bits offset + 1 bit length */
+ nOutOffset = apultra_write_bits(pOutData, nOutOffset, nMaxOutDataSize, TOKEN_CODE_7BIT_MATCH, TOKEN_SIZE_7BIT_MATCH, nCurBitsOffset, nCurBitShift);
+
+ if (nOutOffset < 0 || nOutOffset >= nMaxOutDataSize)
+ return -1;
+ pOutData[nOutOffset++] = ((nMatchOffset) & 0x7f) << 1 | (nMatchLen - 2);
+
+ *nFollowsLiteral = 0;
+ nRepMatchOffset = nMatchOffset;
+
+ pCompressor->stats.num_7bit_matches++;
+ }
+ else {
+ /* 8+n bits offset */
+ nOutOffset = apultra_write_bits(pOutData, nOutOffset, nMaxOutDataSize, TOKEN_CODE_LARGE_MATCH, TOKEN_SIZE_LARGE_MATCH, nCurBitsOffset, nCurBitShift);
+
+ if (nOutOffset < 0 || nOutOffset >= nMaxOutDataSize)
+ return -1;
+ if (*nFollowsLiteral)
+ nOutOffset = apultra_write_gamma2_value(pOutData, nOutOffset, nMaxOutDataSize, (nMatchOffset >> 8) + 3, nCurBitsOffset, nCurBitShift);
+ else
+ nOutOffset = apultra_write_gamma2_value(pOutData, nOutOffset, nMaxOutDataSize, (nMatchOffset >> 8) + 2, nCurBitsOffset, nCurBitShift);
+ pOutData[nOutOffset++] = nMatchOffset & 0xff;
+
+ /* The match length isn't encoded in the command, emit elias gamma value */
+
+ if (nMatchOffset < 128 || nMatchOffset >= MINMATCH4_OFFSET)
+ nOutOffset = apultra_write_gamma2_value(pOutData, nOutOffset, nMaxOutDataSize, nMatchLen - 2, nCurBitsOffset, nCurBitShift);
+ else if (nMatchOffset < MINMATCH3_OFFSET)
+ nOutOffset = apultra_write_gamma2_value(pOutData, nOutOffset, nMaxOutDataSize, nMatchLen, nCurBitsOffset, nCurBitShift);
+ else
+ nOutOffset = apultra_write_gamma2_value(pOutData, nOutOffset, nMaxOutDataSize, nMatchLen - 1, nCurBitsOffset, nCurBitShift);
+ if (nOutOffset < 0) return -1;
+
+ *nFollowsLiteral = 0;
+ nRepMatchOffset = nMatchOffset;
+
+ pCompressor->stats.num_variable_matches++;
+ }
+ }
+
+ if (nMatchOffset < pCompressor->stats.min_offset || pCompressor->stats.min_offset == -1)
+ pCompressor->stats.min_offset = nMatchOffset;
+ if (nMatchOffset > pCompressor->stats.max_offset)
+ pCompressor->stats.max_offset = nMatchOffset;
+ pCompressor->stats.total_offsets += (long long)nMatchOffset;
+
+ if (nMatchLen < pCompressor->stats.min_match_len || pCompressor->stats.min_match_len == -1)
+ pCompressor->stats.min_match_len = nMatchLen;
+ if (nMatchLen > pCompressor->stats.max_match_len)
+ pCompressor->stats.max_match_len = nMatchLen;
+ pCompressor->stats.total_match_lens += nMatchLen;
+ pCompressor->stats.match_divisor++;
+
+ if (nMatchOffset == 1) {
+ if (nMatchLen < pCompressor->stats.min_rle1_len || pCompressor->stats.min_rle1_len == -1)
+ pCompressor->stats.min_rle1_len = nMatchLen;
+ if (nMatchLen > pCompressor->stats.max_rle1_len)
+ pCompressor->stats.max_rle1_len = nMatchLen;
+ pCompressor->stats.total_rle1_lens += nMatchLen;
+ pCompressor->stats.rle1_divisor++;
+ }
+ else if (nMatchOffset == 2) {
+ if (nMatchLen < pCompressor->stats.min_rle2_len || pCompressor->stats.min_rle2_len == -1)
+ pCompressor->stats.min_rle2_len = nMatchLen;
+ if (nMatchLen > pCompressor->stats.max_rle2_len)
+ pCompressor->stats.max_rle2_len = nMatchLen;
+ pCompressor->stats.total_rle2_lens += nMatchLen;
+ pCompressor->stats.rle2_divisor++;
+ }
+
+ i += nMatchLen;
+
+ pCompressor->stats.commands_divisor++;
+ }
+ else if (pMatch->length == 1) {
+ int nMatchOffset = pMatch->offset;
+
+ /* 4 bits offset */
+
+ if (nMatchOffset < 0 || nMatchOffset > 15)
+ return -1;
+
+ nOutOffset = apultra_write_bits(pOutData, nOutOffset, nMaxOutDataSize, TOKEN_CODE_4BIT_MATCH, TOKEN_SIZE_4BIT_MATCH, nCurBitsOffset, nCurBitShift);
+ nOutOffset = apultra_write_bits(pOutData, nOutOffset, nMaxOutDataSize, nMatchOffset, 4, nCurBitsOffset, nCurBitShift);
+ if (nOutOffset < 0) return -1;
+
+ pCompressor->stats.num_4bit_matches++;
+ pCompressor->stats.commands_divisor++;
+
+ i++;
+ *nFollowsLiteral = 1;
+ }
+ else {
+ /* Literal */
+
+ nOutOffset = apultra_write_bits(pOutData, nOutOffset, nMaxOutDataSize, 0 /* literal */, 1, nCurBitsOffset, nCurBitShift);
+
+ if (nOutOffset < 0 || nOutOffset >= nMaxOutDataSize)
+ return -1;
+ pOutData[nOutOffset++] = pInWindow[i];
+
+ pCompressor->stats.num_literals++;
+ pCompressor->stats.commands_divisor++;
+ i++;
+ *nFollowsLiteral = 1;
+ }
+
+ int nCurSafeDist = (i - nStartOffset) - nOutOffset;
+ if (nCurSafeDist >= 0 && pCompressor->stats.safe_dist < nCurSafeDist)
+ pCompressor->stats.safe_dist = nCurSafeDist;
+ }
+
+ if (nBlockFlags & 2) {
+ /* 8 bits offset */
+
+ nOutOffset = apultra_write_bits(pOutData, nOutOffset, nMaxOutDataSize, TOKEN_CODE_7BIT_MATCH, TOKEN_SIZE_7BIT_MATCH, nCurBitsOffset, nCurBitShift);
+
+ if (nOutOffset < 0 || nOutOffset >= nMaxOutDataSize)
+ return -1;
+ pOutData[nOutOffset++] = 0x00; /* Offset: EOD */
+ pCompressor->stats.num_eod++;
+ pCompressor->stats.commands_divisor++;
+
+ int nCurSafeDist = (i - nStartOffset) - nOutOffset;
+ if (nCurSafeDist >= 0 && pCompressor->stats.safe_dist < nCurSafeDist)
+ pCompressor->stats.safe_dist = nCurSafeDist;
+ }
+
+ *nCurRepMatchOffset = nRepMatchOffset;
+ return nOutOffset;
+}
+
/**
 * Select the most optimal matches, reduce the token count if possible, and then emit a block of compressed data
 *
 * @param pCompressor compression context
 * @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
 * @param nPreviousBlockSize number of previously compressed bytes (or 0 for none)
 * @param nInDataSize number of input bytes to compress
 * @param pOutData pointer to output buffer
 * @param nMaxOutDataSize maximum size of output buffer, in bytes
 * @param nCurBitsOffset write index into output buffer, of current byte being filled with bits
 * @param nCurBitShift bit shift count
 * @param nCurFollowsLiteral non-zero if the next command to be issued follows a literal, 0 if not
 * @param nCurRepMatchOffset starting rep offset for this block, updated after the block is compressed successfully
 * @param nBlockFlags bit 0: 1 for first block, 0 otherwise; bit 1: 1 for last block, 0 otherwise
 *
 * @return size of compressed data in output buffer, or -1 if the data is uncompressible
 */
static int apultra_optimize_and_write_block(apultra_compressor *pCompressor, const unsigned char *pInWindow, const int nPreviousBlockSize, const int nInDataSize, unsigned char *pOutData, const int nMaxOutDataSize, int *nCurBitsOffset, int *nCurBitShift, int *nCurFollowsLiteral, int *nCurRepMatchOffset, const int nBlockFlags) {
   int nOutOffset = 0;
   const int nEndOffset = nPreviousBlockSize + nInDataSize;
   const int nArrivalsPerPosition = pCompressor->max_arrivals;
   /* The intervals buffer is no longer needed at this stage, so it is reused
    * as per-position run-length storage. */
   int *rle_len = (int*)pCompressor->intervals /* reuse */;
   int i, nPosition;

   memset(pCompressor->best_match, 0, pCompressor->block_size * sizeof(apultra_final_match));

   /* Extra match supplementation only runs for a block that is both first and
    * last, i.e. when the whole input fits in a single block. */
   if ((nBlockFlags & 3) == 3) {
      int *first_offset_for_byte = pCompressor->first_offset_for_byte;
      int *next_offset_for_pos = pCompressor->next_offset_for_pos;

      /* Supplement 2 and 3-byte matches */

      memset(first_offset_for_byte, 0xff, sizeof(int) * 65536);
      memset(next_offset_for_pos, 0xff, sizeof(int) * nInDataSize);

      /* Build, for every 16-bit byte pair, a chain of positions where that
       * pair occurs (most recent first). */
      for (nPosition = nPreviousBlockSize; nPosition < (nEndOffset - 1); nPosition++) {
         next_offset_for_pos[nPosition - nPreviousBlockSize] = first_offset_for_byte[((unsigned int)pInWindow[nPosition]) | (((unsigned int)pInWindow[nPosition + 1]) << 8)];
         first_offset_for_byte[((unsigned int)pInWindow[nPosition]) | (((unsigned int)pInWindow[nPosition + 1]) << 8)] = nPosition;
      }

      /* Walk each position's chain and append short (2/3 byte) matches that
       * the main match finder didn't record, up to 15 slots / 6 insertions. */
      for (nPosition = nPreviousBlockSize + 1; nPosition < (nEndOffset - 1); nPosition++) {
         apultra_match *match = pCompressor->match + ((nPosition - nPreviousBlockSize) << MATCHES_PER_INDEX_SHIFT);
         unsigned short *match_depth = pCompressor->match_depth + ((nPosition - nPreviousBlockSize) << MATCHES_PER_INDEX_SHIFT);
         int m = 0, nInserted = 0;
         int nMatchPos;

         /* Skip past slots already filled by the match finder */
         while (m < 15 && match[m].length)
            m++;

         for (nMatchPos = next_offset_for_pos[nPosition - nPreviousBlockSize]; m < 15 && nMatchPos >= 0; nMatchPos = next_offset_for_pos[nMatchPos - nPreviousBlockSize]) {
            int nMatchOffset = nPosition - nMatchPos;

            if (nMatchOffset <= pCompressor->max_offset) {
               int nExistingMatchIdx;
               int nAlreadyExists = 0;

               /* Reject offsets already present, directly or via depth chain */
               for (nExistingMatchIdx = 0; nExistingMatchIdx < m; nExistingMatchIdx++) {
                  if (match[nExistingMatchIdx].offset == nMatchOffset ||
                     (match[nExistingMatchIdx].offset - (match_depth[nExistingMatchIdx] & 0x3fff)) == nMatchOffset) {
                     nAlreadyExists = 1;
                     break;
                  }
               }

               if (!nAlreadyExists) {
                  /* 2-byte match by construction; extend to 3 if the next byte also matches */
                  match[m].length = (nPosition < (nEndOffset - 2) && pInWindow[nMatchPos + 2] == pInWindow[nPosition + 2]) ? 3 : 2;
                  match[m].offset = nMatchOffset;
                  match_depth[m] = 0x4000;   /* flag: supplemental short match */
                  m++;
                  nInserted++;
                  if (nInserted >= 6)
                     break;
               }
            }
            else {
               /* Chains are ordered nearest-first; past max_offset, stop */
               break;
            }
         }
      }
   }

   /* Precompute, for every position, the length of the run of identical bytes
    * starting there (used by the cost model). */
   i = 0;
   while (i < nEndOffset) {
      int nRangeStartIdx = i;
      unsigned char c = pInWindow[nRangeStartIdx];
      do {
         i++;
      }
      while (i < nEndOffset && pInWindow[i] == c);
      while (nRangeStartIdx < i) {
         rle_len[nRangeStartIdx] = i - nRangeStartIdx;
         nRangeStartIdx++;
      }
   }

   /* First forward pass: also inserts forward rep-match candidates */
   apultra_optimize_forward(pCompressor, pInWindow, nPreviousBlockSize, nEndOffset, 1 /* nInsertForwardReps */, nCurRepMatchOffset, nBlockFlags, nArrivalsPerPosition);

   /* Second supplementation round, only at the maximum arrivals setting
    * (single-block input small enough to afford it). */
   if ((nBlockFlags & 3) == 3 && nArrivalsPerPosition == NARRIVALS_PER_POSITION_MAX) {
      const int* next_offset_for_pos = pCompressor->next_offset_for_pos;
      int* offset_cache = pCompressor->offset_cache;

      /* Supplement matches further */

      memset(offset_cache, 0xff, sizeof(int) * 2048);

      for (nPosition = nPreviousBlockSize + 1; nPosition < (nEndOffset - 1); nPosition++) {
         apultra_match* match = pCompressor->match + ((nPosition - nPreviousBlockSize) << MATCHES_PER_INDEX_SHIFT);

         /* Only bother where the best known match is short */
         if (match[0].length < 8) {
            unsigned short* match_depth = pCompressor->match_depth + ((nPosition - nPreviousBlockSize) << MATCHES_PER_INDEX_SHIFT);
            int m = 0, nInserted = 0;
            int nMatchPos;

            /* Seed the hashed offset cache with the offsets already recorded
             * for this position, so duplicates can be rejected cheaply. */
            while (m < 46 && match[m].length) {
               offset_cache[match[m].offset & 2047] = nPosition;
               offset_cache[(match[m].offset - (match_depth[m] & 0x3fff)) & 2047] = nPosition;
               m++;
            }

            for (nMatchPos = next_offset_for_pos[nPosition - nPreviousBlockSize]; m < 46 && nMatchPos >= 0; nMatchPos = next_offset_for_pos[nMatchPos - nPreviousBlockSize]) {
               int nMatchOffset = nPosition - nMatchPos;

               if (nMatchOffset <= pCompressor->max_offset) {
                  int nAlreadyExists = 0;

                  /* Cache hit only means "possibly present" (hash is mod 2048);
                   * confirm with a linear scan. */
                  if (offset_cache[nMatchOffset & 2047] == nPosition) {
                     int nExistingMatchIdx;

                     for (nExistingMatchIdx = 0; nExistingMatchIdx < m; nExistingMatchIdx++) {
                        if (match[nExistingMatchIdx].offset == nMatchOffset ||
                           (match[nExistingMatchIdx].offset - (match_depth[nExistingMatchIdx] & 0x3fff)) == nMatchOffset) {
                           nAlreadyExists = 1;

                           /* Try to lengthen matches inserted by round one */
                           if (match_depth[nExistingMatchIdx] == 0x4000) {
                              int nMatchLen = 2;
                              while (nMatchLen < 16 && nPosition < (nEndOffset - nMatchLen) && pInWindow[nMatchPos + nMatchLen] == pInWindow[nPosition + nMatchLen])
                                 nMatchLen++;
                              if (nMatchLen > (int)match[nExistingMatchIdx].length)
                                 match[nExistingMatchIdx].length = nMatchLen;
                           }

                           break;
                        }
                     }
                  }

                  if (!nAlreadyExists) {
                     /* Only insert if the same offset re-matches again within
                      * the next few positions (i.e. it may pay off as a rep). */
                     int nForwardPos = nPosition + 2 + 1;
                     int nGotMatch = 0;

                     while (nForwardPos >= nMatchOffset && (nForwardPos + 2) < nEndOffset && nForwardPos < (nPosition + 2 + 1 + 5)) {
                        if (!memcmp(pInWindow + nForwardPos, pInWindow + nForwardPos - nMatchOffset, 2)) {
                           nGotMatch = 1;
                           break;
                        }
                        nForwardPos++;
                     }

                     if (nGotMatch) {
                        int nMatchLen = 2;
                        while (nMatchLen < 16 && nPosition < (nEndOffset - nMatchLen) && pInWindow[nMatchPos + nMatchLen] == pInWindow[nPosition + nMatchLen])
                           nMatchLen++;
                        match[m].length = nMatchLen;
                        match[m].offset = nMatchOffset;
                        match_depth[m] = 0;
                        m++;

                        apultra_insert_forward_match(pCompressor, pInWindow, nPosition, nMatchOffset, nPreviousBlockSize, nEndOffset, nArrivalsPerPosition, 8);

                        nInserted++;
                        if (nInserted >= 18 || (nInserted >= 15 && m >= 38))
                           break;
                     }
                  }
               }
               else {
                  break;
               }
            }
         }
      }
   }

   /* Pick optimal matches */
   apultra_optimize_forward(pCompressor, pInWindow, nPreviousBlockSize, nEndOffset, 0 /* nInsertForwardReps */, nCurRepMatchOffset, nBlockFlags, nArrivalsPerPosition);

   /* Apply reduction and merge pass; iterate until a fixed point is reached
    * (capped at 20 passes). */
   int nDidReduce;
   int nPasses = 0;
   do {
      nDidReduce = apultra_reduce_commands(pCompressor, pInWindow, pCompressor->best_match - nPreviousBlockSize, nPreviousBlockSize, nEndOffset, nCurRepMatchOffset, nBlockFlags);
      nPasses++;
   } while (nDidReduce && nPasses < 20);

   /* Write compressed block */

   return apultra_write_block(pCompressor, pCompressor->best_match - nPreviousBlockSize, pInWindow, nPreviousBlockSize, nEndOffset, pOutData, nOutOffset, nMaxOutDataSize, nCurBitsOffset, nCurBitShift, nCurFollowsLiteral, nCurRepMatchOffset, nBlockFlags);
}
+
+/* Forward declaration */
+static void apultra_compressor_destroy(apultra_compressor *pCompressor);
+
+/**
+ * Initialize compression context
+ *
+ * @param pCompressor compression context to initialize
+ * @param nBlockSize maximum size of input data (bytes to compress only)
+ * @param nMaxWindowSize maximum size of input data window (previously compressed bytes + bytes to compress)
+ * @param nMaxArrivals maximum number of arrivals per position
+ * @param nFlags compression flags
+ *
+ * @return 0 for success, non-zero for failure
+ */
+static int apultra_compressor_init(apultra_compressor *pCompressor, const int nBlockSize, const int nMaxWindowSize, const int nMaxArrivals, const int nFlags) {
+ int nResult;
+
+ nResult = divsufsort_init(&pCompressor->divsufsort_context);
+ pCompressor->intervals = NULL;
+ pCompressor->pos_data = NULL;
+ pCompressor->open_intervals = NULL;
+ pCompressor->match = NULL;
+ pCompressor->match_depth = NULL;
+ pCompressor->match1 = NULL;
+ pCompressor->best_match = NULL;
+ pCompressor->arrival = NULL;
+ pCompressor->first_offset_for_byte = NULL;
+ pCompressor->next_offset_for_pos = NULL;
+ pCompressor->offset_cache = NULL;
+ pCompressor->flags = nFlags;
+ pCompressor->block_size = nBlockSize;
+ pCompressor->max_arrivals = nMaxArrivals;
+
+ memset(&pCompressor->stats, 0, sizeof(pCompressor->stats));
+ pCompressor->stats.min_match_len = -1;
+ pCompressor->stats.min_offset = -1;
+ pCompressor->stats.min_rle1_len = -1;
+ pCompressor->stats.min_rle2_len = -1;
+
+ if (!nResult) {
+ pCompressor->intervals = (unsigned long long *)malloc(nMaxWindowSize * sizeof(unsigned long long));
+
+ if (pCompressor->intervals) {
+ pCompressor->pos_data = (unsigned long long *)malloc(nMaxWindowSize * sizeof(unsigned long long));
+
+ if (pCompressor->pos_data) {
+ pCompressor->open_intervals = (unsigned long long *)malloc((LCP_AND_TAG_MAX + 1) * sizeof(unsigned long long));
+
+ if (pCompressor->open_intervals) {
+ pCompressor->arrival = (apultra_arrival *)malloc((nBlockSize + 1) * nMaxArrivals * sizeof(apultra_arrival));
+
+ if (pCompressor->arrival) {
+ pCompressor->best_match = (apultra_final_match *)malloc(nBlockSize * sizeof(apultra_final_match));
+
+ if (pCompressor->best_match) {
+ pCompressor->match = (apultra_match *)malloc(nBlockSize * NMATCHES_PER_INDEX * sizeof(apultra_match));
+ if (pCompressor->match) {
+ pCompressor->match_depth = (unsigned short *)malloc(nBlockSize * NMATCHES_PER_INDEX * sizeof(unsigned short));
+ if (pCompressor->match_depth) {
+ pCompressor->match1 = (unsigned char *)malloc(nBlockSize * sizeof(unsigned char));
+ if (pCompressor->match1) {
+ pCompressor->first_offset_for_byte = (int*)malloc(65536 * sizeof(int));
+ if (pCompressor->first_offset_for_byte) {
+ pCompressor->next_offset_for_pos = (int*)malloc(nBlockSize * sizeof(int));
+ if (pCompressor->next_offset_for_pos) {
+ if (nMaxArrivals == NARRIVALS_PER_POSITION_MAX) {
+ pCompressor->offset_cache = (int*)malloc(2048 * sizeof(int));
+ if (pCompressor->offset_cache) {
+ return 0;
+ }
+ }
+ else {
+ return 0;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ apultra_compressor_destroy(pCompressor);
+ return 100;
+}
+
+/**
+ * Clean up compression context and free up any associated resources
+ *
+ * @param pCompressor compression context to clean up
+ */
+static void apultra_compressor_destroy(apultra_compressor *pCompressor) {
+ divsufsort_destroy(&pCompressor->divsufsort_context);
+
+ if (pCompressor->offset_cache) {
+ free(pCompressor->offset_cache);
+ pCompressor->offset_cache = NULL;
+ }
+
+ if (pCompressor->next_offset_for_pos) {
+ free(pCompressor->next_offset_for_pos);
+ pCompressor->next_offset_for_pos = NULL;
+ }
+
+ if (pCompressor->first_offset_for_byte) {
+ free(pCompressor->first_offset_for_byte);
+ pCompressor->first_offset_for_byte = NULL;
+ }
+
+ if (pCompressor->match1) {
+ free(pCompressor->match1);
+ pCompressor->match1 = NULL;
+ }
+
+ if (pCompressor->match_depth) {
+ free(pCompressor->match_depth);
+ pCompressor->match_depth = NULL;
+ }
+
+ if (pCompressor->match) {
+ free(pCompressor->match);
+ pCompressor->match = NULL;
+ }
+
+ if (pCompressor->arrival) {
+ free(pCompressor->arrival);
+ pCompressor->arrival = NULL;
+ }
+
+ if (pCompressor->best_match) {
+ free(pCompressor->best_match);
+ pCompressor->best_match = NULL;
+ }
+
+ if (pCompressor->open_intervals) {
+ free(pCompressor->open_intervals);
+ pCompressor->open_intervals = NULL;
+ }
+
+ if (pCompressor->pos_data) {
+ free(pCompressor->pos_data);
+ pCompressor->pos_data = NULL;
+ }
+
+ if (pCompressor->intervals) {
+ free(pCompressor->intervals);
+ pCompressor->intervals = NULL;
+ }
+}
+
+/**
+ * Compress one block of data
+ *
+ * @param pCompressor compression context
+ * @param pInWindow pointer to input data window (previously compressed bytes + bytes to compress)
+ * @param nPreviousBlockSize number of previously compressed bytes (or 0 for none)
+ * @param nInDataSize number of input bytes to compress
+ * @param pOutData pointer to output buffer
+ * @param nMaxOutDataSize maximum size of output buffer, in bytes
+ * @param nCurBitsOffset write index into output buffer, of current byte being filled with bits
+ * @param nCurBitShift bit shift count
+ * @param nCurFollowsLiteral non-zero if the next command to be issued follows a literal, 0 if not
+ * @param nCurRepMatchOffset starting rep offset for this block, updated after the block is compressed successfully
+ * @param nBlockFlags bit 0: 1 for first block, 0 otherwise; bit 1: 1 for last block, 0 otherwise
+ *
+ * @return size of compressed data in output buffer, or -1 if the data is uncompressible
+ */
+static int apultra_compressor_shrink_block(apultra_compressor *pCompressor, const unsigned char *pInWindow, const int nPreviousBlockSize, const int nInDataSize, unsigned char *pOutData, const int nMaxOutDataSize, int *nCurBitsOffset, int *nCurBitShift, int *nCurFollowsLiteral, int *nCurRepMatchOffset, const int nBlockFlags) {
+ int nCompressedSize;
+
+ if (apultra_build_suffix_array(pCompressor, pInWindow, nPreviousBlockSize + nInDataSize))
+ nCompressedSize = -1;
+ else {
+ if (nPreviousBlockSize) {
+ apultra_skip_matches(pCompressor, 0, nPreviousBlockSize);
+ }
+ apultra_find_all_matches(pCompressor, NMATCHES_PER_INDEX, nPreviousBlockSize, nPreviousBlockSize + nInDataSize, nBlockFlags);
+
+ nCompressedSize = apultra_optimize_and_write_block(pCompressor, pInWindow, nPreviousBlockSize, nInDataSize, pOutData, nMaxOutDataSize, nCurBitsOffset, nCurBitShift, nCurFollowsLiteral, nCurRepMatchOffset, nBlockFlags);
+ }
+
+ return nCompressedSize;
+}
+
+/**
+ * Get maximum compressed size of input(source) data
+ *
+ * @param nInputSize input(source) size in bytes
+ *
+ * @return maximum compressed size
+ */
size_t apultra_get_max_compressed_size(size_t nInputSize) {
   /* Worst case: every input byte is emitted as a literal (1 command bit +
    * 8 data bits), plus the end-of-data marker (1 match bit + 2 command bits
    * + 8 offset bits), rounded up to whole bytes. */
   const size_t nWorstCaseBits = nInputSize * 9 /* literals + literal bits */
                               + 1 /* match bit */
                               + 2 /* 7+1 command bits */
                               + 8 /* EOD offset bits */;
   return (nWorstCaseBits + 7) / 8;
}
+
+/**
+ * Compress memory
+ *
+ * @param pInputData pointer to input(source) data to compress
+ * @param pOutBuffer buffer for compressed data
+ * @param nInputSize input(source) size in bytes
+ * @param nMaxOutBufferSize maximum capacity of compression buffer
+ * @param nFlags compression flags (set to 0)
+ * @param nMaxWindowSize maximum window size to use (0 for default)
+ * @param nDictionarySize size of dictionary in front of input data (0 for none)
+ * @param progress progress function, called after compressing each block, or NULL for none
+ * @param pStats pointer to compression stats that are filled if this function is successful, or NULL
+ *
+ * @return actual compressed size, or -1 for error
+ */
size_t apultra_compress(const unsigned char *pInputData, unsigned char *pOutBuffer, size_t nInputSize, size_t nMaxOutBufferSize,
   const unsigned int nFlags, size_t nMaxWindowSize, size_t nDictionarySize, void(*progress)(long long nOriginalSize, long long nCompressedSize), apultra_stats *pStats) {
   apultra_compressor compressor;
   size_t nOriginalSize = 0;
   size_t nCompressedSize = 0L;
   int nResult;
   int nMaxArrivals = NARRIVALS_PER_POSITION_SMALL;
   int nError = 0;
   /* Block size: whole input if it fits under BLOCK_SIZE (minimum 1024),
    * otherwise split into BLOCK_SIZE chunks. */
   const int nBlockSize = (nInputSize < BLOCK_SIZE) ? ((nInputSize < 1024) ? 1024 : (int)nInputSize) : BLOCK_SIZE;
   const int nMaxOutBlockSize = (int)apultra_get_max_compressed_size(nBlockSize);

   /* If the payload (input minus dictionary prefix) fits in a single block,
    * afford a higher arrivals count: maximum up to 256 KB, normal above. */
   if (nDictionarySize < nInputSize) {
      int nInDataSize = (int)(nInputSize - nDictionarySize);
      if (nInDataSize > nBlockSize)
         nInDataSize = nBlockSize;

      if (nInDataSize > 0 && (nDictionarySize + nInDataSize) >= nInputSize) {
         if (nInputSize <= 262144)
            nMaxArrivals = NARRIVALS_PER_POSITION_MAX;
         else
            nMaxArrivals = NARRIVALS_PER_POSITION_NORMAL;
      }
   }

   /* Window is two blocks: previous block as context + current block. */
   nResult = apultra_compressor_init(&compressor, nBlockSize, nBlockSize * 2, nMaxArrivals, nFlags);
   if (nResult != 0) {
      return -1;   /* reported as (size_t)-1, i.e. SIZE_MAX */
   }

   compressor.max_offset = nMaxWindowSize ? (int)nMaxWindowSize : MAX_OFFSET;

   int nPreviousBlockSize = 0;
   int nNumBlocks = 0;
   /* INT_MIN marks "no bit-holding byte opened yet" for the bit writer —
    * NOTE(review): sentinel semantics inferred from the != INT_MIN check
    * below; confirm against apultra_write_bits. */
   int nCurBitsOffset = INT_MIN, nCurBitShift = 0, nCurFollowsLiteral = 0;
   int nBlockFlags = 1;   /* bit 0 set: first block */
   int nCurRepMatchOffset = 0;

   /* A dictionary is treated as an already-"compressed" prefix block. */
   if (nDictionarySize) {
      nOriginalSize = (int)nDictionarySize;
      nPreviousBlockSize = (int)nDictionarySize;
   }

   while (nOriginalSize < nInputSize && !nError) {
      int nInDataSize;

      nInDataSize = (int)(nInputSize - nOriginalSize);
      if (nInDataSize > nBlockSize)
         nInDataSize = nBlockSize;

      if (nInDataSize > 0) {
         int nOutDataSize;
         int nOutDataEnd = (int)(nMaxOutBufferSize - nCompressedSize);

         if (nOutDataEnd > nMaxOutBlockSize)
            nOutDataEnd = nMaxOutBlockSize;

         /* Set "last block" flag when this chunk finishes the input */
         if ((nOriginalSize + nInDataSize) >= nInputSize)
            nBlockFlags |= 2;
         /* The window passed down starts nPreviousBlockSize bytes before the
          * current chunk, so earlier data can be matched against. */
         nOutDataSize = apultra_compressor_shrink_block(&compressor, pInputData + nOriginalSize - nPreviousBlockSize, nPreviousBlockSize, nInDataSize, pOutBuffer + nCompressedSize, nOutDataEnd,
            &nCurBitsOffset, &nCurBitShift, &nCurFollowsLiteral, &nCurRepMatchOffset, nBlockFlags);
         nBlockFlags &= (~1);   /* no longer the first block */

         if (nOutDataSize >= 0) {
            /* Write compressed block */

            if (!nError) {
               nOriginalSize += nInDataSize;
               nCompressedSize += nOutDataSize;
               /* Rebase the bit writer's byte index: the next block writes
                * into a buffer pointer advanced by nOutDataSize. */
               if (nCurBitsOffset != INT_MIN)
                  nCurBitsOffset -= nOutDataSize;
            }
         }
         else {
            nError = -1;
         }

         nPreviousBlockSize = nInDataSize;
         nNumBlocks++;
      }

      if (!nError && nOriginalSize < nInputSize) {
         if (progress)
            progress(nOriginalSize, nCompressedSize);
      }
   }

   /* Final progress callback covers the last block */
   if (progress)
      progress(nOriginalSize, nCompressedSize);
   if (pStats)
      *pStats = compressor.stats;

   apultra_compressor_destroy(&compressor);

   if (nError) {
      return -1;   /* reported as (size_t)-1, i.e. SIZE_MAX */
   }
   else {
      return nCompressedSize;
   }
}
diff --git a/tools/z64compress/src/enc/apultra/shrink.h b/tools/z64compress/src/enc/apultra/shrink.h
new file mode 100644
index 000000000..bd905936f
--- /dev/null
+++ b/tools/z64compress/src/enc/apultra/shrink.h
@@ -0,0 +1,174 @@
+/*
+ * shrink.h - compressor definitions
+ *
+ * Copyright (C) 2019 Emmanuel Marty
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/*
+ * Uses the libdivsufsort library Copyright (c) 2003-2008 Yuta Mori
+ *
+ * Inspired by cap by Sven-Åke Dahl. https://github.com/svendahl/cap
+ * Also inspired by Charles Bloom's compression blog. http://cbloomrants.blogspot.com/
+ * With ideas from LZ4 by Yann Collet. https://github.com/lz4/lz4
+ * With help and support from spke
+ *
+ */
+
+#ifndef _SHRINK_H
+#define _SHRINK_H
+
+#include "divsufsort.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define LCP_BITS 15
+#define TAG_BITS 4
+#define LCP_MAX ((1U<<(LCP_BITS - TAG_BITS)) - 1)
+#define LCP_AND_TAG_MAX ((1U<<LCP_BITS) - 1)
+ * With ideas from Lizard by Przemyslaw Skibinski and Yann Collet. https://github.com/inikep/lizard
+ * Also with ideas from smallz4 by Stephan Brumme. https://create.stephan-brumme.com/smallz4/
+ *
+ */
+
+#include <stdlib.h>
+#include <string.h>
+//#include "shrink_context.h"
+//#include "shrink_block.h"
+#include "format.h"
+#include "matchfinder.h"
+//#include "lib.h"
diff --git a/tools/z64compress/src/enc/apultra/sssort.c b/tools/z64compress/src/enc/apultra/sssort.c
new file mode 100644
index 000000000..4a18fd2ab
--- /dev/null
+++ b/tools/z64compress/src/enc/apultra/sssort.c
@@ -0,0 +1,815 @@
+/*
+ * sssort.c for libdivsufsort
+ * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "divsufsort_private.h"
+
+
+/*- Private Functions -*/
+
/* lg_table[i] = floor(log2(i)) for i in [1, 255]; lg_table[0] is -1
 * (log2 of zero is undefined). Used for byte-wise integer log lookups. */
static const saint_t lg_table[256]= {
 -1,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
  5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
  6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
  6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
};
+
+#if (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE)
+
/* ss_ilg(n): integer base-2 logarithm (floor(log2(n))) of a positive value,
 * computed byte by byte through lg_table. The variant chosen depends on how
 * large n can get, which is bounded by SS_BLOCKSIZE. */
static INLINE
saint_t
ss_ilg(saidx_t n) {
#if SS_BLOCKSIZE == 0
  /* Unbounded n: scan from the most significant byte downwards. */
# if defined(BUILD_DIVSUFSORT64)
  return (n >> 32) ?
          ((n >> 48) ?
            ((n >> 56) ?
              56 + lg_table[(n >> 56) & 0xff] :
              48 + lg_table[(n >> 48) & 0xff]) :
            ((n >> 40) ?
              40 + lg_table[(n >> 40) & 0xff] :
              32 + lg_table[(n >> 32) & 0xff])) :
          ((n & 0xffff0000) ?
            ((n & 0xff000000) ?
              24 + lg_table[(n >> 24) & 0xff] :
              16 + lg_table[(n >> 16) & 0xff]) :
            ((n & 0x0000ff00) ?
              8 + lg_table[(n >> 8) & 0xff] :
              0 + lg_table[(n >> 0) & 0xff]));
# else
  return (n & 0xffff0000) ?
          ((n & 0xff000000) ?
            24 + lg_table[(n >> 24) & 0xff] :
            16 + lg_table[(n >> 16) & 0xff]) :
          ((n & 0x0000ff00) ?
            8 + lg_table[(n >> 8) & 0xff] :
            0 + lg_table[(n >> 0) & 0xff]);
# endif
#elif SS_BLOCKSIZE < 256
  /* n fits in one byte: single lookup. */
  return lg_table[n];
#else
  /* n fits in 16 bits: at most two lookups. */
  return (n & 0xff00) ?
          8 + lg_table[(n >> 8) & 0xff] :
          0 + lg_table[(n >> 0) & 0xff];
#endif
}
+
+#endif /* (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) */
+
+#if SS_BLOCKSIZE != 0
+
/* sqq_table[i] = floor(16 * sqrt(i)), i.e. the square root of i in 4.4
 * fixed point. Seeds the Newton-Raphson refinement in ss_isqrt. */
static const saint_t sqq_table[256] = {
  0,  16,  22,  27,  32,  35,  39,  42,  45,  48,  50,  53,  55,  57,  59,  61,
 64,  65,  67,  69,  71,  73,  75,  76,  78,  80,  81,  83,  84,  86,  87,  89,
 90,  91,  93,  94,  96,  97,  98,  99, 101, 102, 103, 104, 106, 107, 108, 109,
110, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
128, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
143, 144, 144, 145, 146, 147, 148, 149, 150, 150, 151, 152, 153, 154, 155, 155,
156, 157, 158, 159, 160, 160, 161, 162, 163, 163, 164, 165, 166, 167, 167, 168,
169, 170, 170, 171, 172, 173, 173, 174, 175, 176, 176, 177, 178, 178, 179, 180,
181, 181, 182, 183, 183, 184, 185, 185, 186, 187, 187, 188, 189, 189, 190, 191,
192, 192, 193, 193, 194, 195, 195, 196, 197, 197, 198, 199, 199, 200, 201, 201,
202, 203, 203, 204, 204, 205, 206, 206, 207, 208, 208, 209, 209, 210, 211, 211,
212, 212, 213, 214, 214, 215, 215, 216, 217, 217, 218, 218, 219, 219, 220, 221,
221, 222, 222, 223, 224, 224, 225, 225, 226, 226, 227, 227, 228, 229, 229, 230,
230, 231, 231, 232, 232, 233, 234, 234, 235, 235, 236, 236, 237, 237, 238, 238,
239, 240, 240, 241, 241, 242, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247,
247, 248, 248, 249, 249, 250, 250, 251, 251, 252, 252, 253, 253, 254, 254, 255
};
+
/* ss_isqrt(x): integer square root of x, clamped to SS_BLOCKSIZE.
 * Uses a table seed (sqq_table) scaled by the bit length of x, then one or
 * two Newton-Raphson iterations for larger inputs. */
static INLINE
saidx_t
ss_isqrt(saidx_t x) {
  saidx_t y, e;

  /* sqrt(x) would exceed the block size cap */
  if(x >= (SS_BLOCKSIZE * SS_BLOCKSIZE)) { return SS_BLOCKSIZE; }
  /* e = floor(log2(x)) via byte-wise table lookup */
  e = (x & 0xffff0000) ?
        ((x & 0xff000000) ?
          24 + lg_table[(x >> 24) & 0xff] :
          16 + lg_table[(x >> 16) & 0xff]) :
        ((x & 0x0000ff00) ?
          8 + lg_table[(x >> 8) & 0xff] :
          0 + lg_table[(x >> 0) & 0xff]);

  if(e >= 16) {
    /* Seed from the table using the top bits of x, then refine with
     * Newton-Raphson: y' = (y + x/y) / 2 */
    y = sqq_table[x >> ((e - 6) - (e & 1))] << ((e >> 1) - 7);
    if(e >= 24) { y = (y + 1 + x / y) >> 1; }
    y = (y + 1 + x / y) >> 1;
  } else if(e >= 8) {
    /* Mid-range values: the scaled table estimate is accurate enough */
    y = (sqq_table[x >> ((e - 6) - (e & 1))] >> (7 - (e >> 1))) + 1;
  } else {
    /* x < 256: direct 4.4 fixed-point lookup, shifted to an integer */
    return sqq_table[x] >> 4;
  }

  /* The refined estimate may be one too high; correct it */
  return (x < (y * y)) ? y - 1 : y;
}
+
+#endif /* SS_BLOCKSIZE != 0 */
+
+
+/*---------------------------------------------------------------------------*/
+
/* Compares two suffixes. *p1 and *p2 are suffix start positions in T;
 * comparison starts at the given depth. The scan limits U1n/U2n are taken
 * from the next PA entries plus 2 — NOTE(review): this relies on
 * divsufsort's substring layout in PA; confirm against ss_sort's callers.
 * Returns <0, 0 or >0 like strcmp. */
static INLINE
saint_t
ss_compare(const sauchar_t *T,
           const saidx_t *p1, const saidx_t *p2,
           saidx_t depth) {
  const sauchar_t *U1, *U2, *U1n, *U2n;

  /* Advance both cursors while bytes match and neither limit is reached */
  for(U1 = T + depth + *p1,
      U2 = T + depth + *p2,
      U1n = T + *(p1 + 1) + 2,
      U2n = T + *(p2 + 1) + 2;
      (U1 < U1n) && (U2 < U2n) && (*U1 == *U2);
      ++U1, ++U2) {
  }

  /* A suffix that hit its limit first compares as the smaller one;
   * both hitting their limits means equality. */
  return U1 < U1n ?
        (U2 < U2n ? *U1 - *U2 : 1) :
        (U2 < U2n ? -1 : 0);
}
+
+
+/*---------------------------------------------------------------------------*/
+
+#if (SS_BLOCKSIZE != 1) && (SS_INSERTIONSORT_THRESHOLD != 1)
+
/* Insertionsort for small size groups.
 * Sorts SA[first..last) by ss_compare at the given depth. Entries equal to
 * their successor are flagged by bitwise-NOT (negative values), a marking
 * the caller uses to detect groups of equal substrings; the inner do/while
 * skips over such already-marked entries. */
static
void
ss_insertionsort(const sauchar_t *T, const saidx_t *PA,
                 saidx_t *first, saidx_t *last, saidx_t depth) {
  saidx_t *i, *j;
  saidx_t t;
  saint_t r;

  for(i = last - 2; first <= i; --i) {
    /* Shift larger entries right until the slot for t is found */
    for(t = *i, j = i + 1; 0 < (r = ss_compare(T, PA + t, PA + *j, depth));) {
      do { *(j - 1) = *j; } while((++j < last) && (*j < 0));
      if(last <= j) { break; }
    }
    /* Equal to the following entry: mark it with bitwise-NOT */
    if(r == 0) { *j = ~*j; }
    *(j - 1) = t;
  }
}
+
+#endif /* (SS_BLOCKSIZE != 1) && (SS_INSERTIONSORT_THRESHOLD != 1) */
+
+
+/*---------------------------------------------------------------------------*/
+
+#if (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE)
+
/* Sift-down step for ss_heapsort: restores the max-heap property of
 * SA[i..size) where entries are keyed by Td[PA[entry]]. */
static INLINE
void
ss_fixdown(const sauchar_t *Td, const saidx_t *PA,
           saidx_t *SA, saidx_t i, saidx_t size) {
  saidx_t j, k;
  saidx_t v;
  saint_t c, d, e;

  /* v/c: the element being sifted and its key; k: the larger child */
  for(v = SA[i], c = Td[PA[v]]; (j = 2 * i + 1) < size; SA[i] = SA[k], i = k) {
    d = Td[PA[SA[k = j++]]];
    /* NOTE(review): the right-child read SA[j] assumes j < size here;
     * ss_heapsort's even-size pre-step appears to guarantee this — confirm. */
    if(d < (e = Td[PA[SA[j]]])) { k = j; d = e; }
    if(d <= c) { break; }
  }
  SA[i] = v;
}
+
/* Simple top-down heapsort of SA[0..size), keyed by Td[PA[entry]].
 * An even size is reduced to an odd heap (m = size - 1) by pre-placing the
 * last element, so every internal heap node has two children. */
static
void
ss_heapsort(const sauchar_t *Td, const saidx_t *PA, saidx_t *SA, saidx_t size) {
  saidx_t i, m;
  saidx_t t;

  m = size;
  if((size % 2) == 0) {
    m--;
    /* Keep the larger of the trimmed element and its parent at position m */
    if(Td[PA[SA[m / 2]]] < Td[PA[SA[m]]]) { SWAP(SA[m], SA[m / 2]); }
  }

  /* Build the heap bottom-up */
  for(i = m / 2 - 1; 0 <= i; --i) { ss_fixdown(Td, PA, SA, i, m); }
  /* Re-insert the trimmed element for even sizes */
  if((size % 2) == 0) { SWAP(SA[0], SA[m]); ss_fixdown(Td, PA, SA, 0, m); }
  /* Extract the maximum repeatedly */
  for(i = m - 1; 0 < i; --i) {
    t = SA[0], SA[0] = SA[i];
    ss_fixdown(Td, PA, SA, 0, i);
    SA[i] = t;
  }
}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Returns the median of three elements. */
+static INLINE
+saidx_t *
+ss_median3(const sauchar_t *Td, const saidx_t *PA,
+ saidx_t *v1, saidx_t *v2, saidx_t *v3) {
+ saidx_t *t;
+ if(Td[PA[*v1]] > Td[PA[*v2]]) { SWAP(v1, v2); }
+ if(Td[PA[*v2]] > Td[PA[*v3]]) {
+ if(Td[PA[*v1]] > Td[PA[*v3]]) { return v1; }
+ else { return v3; }
+ }
+ return v2;
+}
+
+/* Returns the median of five elements. */
+static INLINE
+saidx_t *
+ss_median5(const sauchar_t *Td, const saidx_t *PA,
+ saidx_t *v1, saidx_t *v2, saidx_t *v3, saidx_t *v4, saidx_t *v5) {
+ saidx_t *t;
+ if(Td[PA[*v2]] > Td[PA[*v3]]) { SWAP(v2, v3); }
+ if(Td[PA[*v4]] > Td[PA[*v5]]) { SWAP(v4, v5); }
+ if(Td[PA[*v2]] > Td[PA[*v4]]) { SWAP(v2, v4); SWAP(v3, v5); }
+ if(Td[PA[*v1]] > Td[PA[*v3]]) { SWAP(v1, v3); }
+ if(Td[PA[*v1]] > Td[PA[*v4]]) { SWAP(v1, v4); SWAP(v3, v5); }
+ if(Td[PA[*v3]] > Td[PA[*v4]]) { return v4; }
+ return v3;
+}
+
+/* Returns the pivot element. */
+static INLINE
+saidx_t *
+ss_pivot(const sauchar_t *Td, const saidx_t *PA, saidx_t *first, saidx_t *last) {
+ saidx_t *middle;
+ saidx_t t;
+
+ t = last - first;
+ middle = first + t / 2;
+
+ if(t <= 512) {
+ if(t <= 32) {
+ return ss_median3(Td, PA, first, middle, last - 1);
+ } else {
+ t >>= 2;
+ return ss_median5(Td, PA, first, first + t, middle, last - 1 - t, last - 1);
+ }
+ }
+ t >>= 3;
+ first = ss_median3(Td, PA, first, first + t, first + (t << 1));
+ middle = ss_median3(Td, PA, middle - t, middle, middle + t);
+ last = ss_median3(Td, PA, last - 1 - (t << 1), last - 1 - t, last - 1);
+ return ss_median3(Td, PA, first, middle, last);
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Binary partition for substrings. */
+static INLINE
+saidx_t *
+ss_partition(const saidx_t *PA,
+ saidx_t *first, saidx_t *last, saidx_t depth) {
+ saidx_t *a, *b;
+ saidx_t t;
+ for(a = first - 1, b = last;;) {
+ for(; (++a < b) && ((PA[*a] + depth) >= (PA[*a + 1] + 1));) { *a = ~*a; }
+ for(; (a < --b) && ((PA[*b] + depth) < (PA[*b + 1] + 1));) { }
+ if(b <= a) { break; }
+ t = ~*b;
+ *b = *a;
+ *a = t;
+ }
+ if(first < a) { *first = ~*first; }
+ return a;
+}
+
+/* Multikey introsort for medium size groups. */
+static
+void
+ss_mintrosort(const sauchar_t *T, const saidx_t *PA,
+ saidx_t *first, saidx_t *last,
+ saidx_t depth) {
+#define STACK_SIZE SS_MISORT_STACKSIZE
+ struct { saidx_t *a, *b, c; saint_t d; } stack[STACK_SIZE];
+ const sauchar_t *Td;
+ saidx_t *a, *b, *c, *d, *e, *f;
+ saidx_t s, t;
+ saint_t ssize;
+ saint_t limit;
+ saint_t v, x = 0;
+
+ for(ssize = 0, limit = ss_ilg(last - first);;) {
+
+ if((last - first) <= SS_INSERTIONSORT_THRESHOLD) {
+#if 1 < SS_INSERTIONSORT_THRESHOLD
+ if(1 < (last - first)) { ss_insertionsort(T, PA, first, last, depth); }
+#endif
+ STACK_POP(first, last, depth, limit);
+ continue;
+ }
+
+ Td = T + depth;
+ if(limit-- == 0) { ss_heapsort(Td, PA, first, last - first); }
+ if(limit < 0) {
+ for(a = first + 1, v = Td[PA[*first]]; a < last; ++a) {
+ if((x = Td[PA[*a]]) != v) {
+ if(1 < (a - first)) { break; }
+ v = x;
+ first = a;
+ }
+ }
+ if(Td[PA[*first] - 1] < v) {
+ first = ss_partition(PA, first, a, depth);
+ }
+ if((a - first) <= (last - a)) {
+ if(1 < (a - first)) {
+ STACK_PUSH(a, last, depth, -1);
+ last = a, depth += 1, limit = ss_ilg(a - first);
+ } else {
+ first = a, limit = -1;
+ }
+ } else {
+ if(1 < (last - a)) {
+ STACK_PUSH(first, a, depth + 1, ss_ilg(a - first));
+ first = a, limit = -1;
+ } else {
+ last = a, depth += 1, limit = ss_ilg(a - first);
+ }
+ }
+ continue;
+ }
+
+ /* choose pivot */
+ a = ss_pivot(Td, PA, first, last);
+ v = Td[PA[*a]];
+ SWAP(*first, *a);
+
+ /* partition */
+ for(b = first; (++b < last) && ((x = Td[PA[*b]]) == v);) { }
+ if(((a = b) < last) && (x < v)) {
+ for(; (++b < last) && ((x = Td[PA[*b]]) <= v);) {
+ if(x == v) { SWAP(*b, *a); ++a; }
+ }
+ }
+ for(c = last; (b < --c) && ((x = Td[PA[*c]]) == v);) { }
+ if((b < (d = c)) && (x > v)) {
+ for(; (b < --c) && ((x = Td[PA[*c]]) >= v);) {
+ if(x == v) { SWAP(*c, *d); --d; }
+ }
+ }
+ for(; b < c;) {
+ SWAP(*b, *c);
+ for(; (++b < c) && ((x = Td[PA[*b]]) <= v);) {
+ if(x == v) { SWAP(*b, *a); ++a; }
+ }
+ for(; (b < --c) && ((x = Td[PA[*c]]) >= v);) {
+ if(x == v) { SWAP(*c, *d); --d; }
+ }
+ }
+
+ if(a <= d) {
+ c = b - 1;
+
+ if((s = a - first) > (t = b - a)) { s = t; }
+ for(e = first, f = b - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); }
+ if((s = d - c) > (t = last - d - 1)) { s = t; }
+ for(e = b, f = last - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); }
+
+ a = first + (b - a), c = last - (d - c);
+ b = (v <= Td[PA[*a] - 1]) ? a : ss_partition(PA, a, c, depth);
+
+ if((a - first) <= (last - c)) {
+ if((last - c) <= (c - b)) {
+ STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
+ STACK_PUSH(c, last, depth, limit);
+ last = a;
+ } else if((a - first) <= (c - b)) {
+ STACK_PUSH(c, last, depth, limit);
+ STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
+ last = a;
+ } else {
+ STACK_PUSH(c, last, depth, limit);
+ STACK_PUSH(first, a, depth, limit);
+ first = b, last = c, depth += 1, limit = ss_ilg(c - b);
+ }
+ } else {
+ if((a - first) <= (c - b)) {
+ STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
+ STACK_PUSH(first, a, depth, limit);
+ first = c;
+ } else if((last - c) <= (c - b)) {
+ STACK_PUSH(first, a, depth, limit);
+ STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
+ first = c;
+ } else {
+ STACK_PUSH(first, a, depth, limit);
+ STACK_PUSH(c, last, depth, limit);
+ first = b, last = c, depth += 1, limit = ss_ilg(c - b);
+ }
+ }
+ } else {
+ limit += 1;
+ if(Td[PA[*first] - 1] < v) {
+ first = ss_partition(PA, first, last, depth);
+ limit = ss_ilg(last - first);
+ }
+ depth += 1;
+ }
+ }
+#undef STACK_SIZE
+}
+
+#endif /* (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) */
+
+
+/*---------------------------------------------------------------------------*/
+
+#if SS_BLOCKSIZE != 0
+
+static INLINE
+void
+ss_blockswap(saidx_t *a, saidx_t *b, saidx_t n) {
+ saidx_t t;
+ for(; 0 < n; --n, ++a, ++b) {
+ t = *a, *a = *b, *b = t;
+ }
+}
+
+static INLINE
+void
+ss_rotate(saidx_t *first, saidx_t *middle, saidx_t *last) {
+ saidx_t *a, *b, t;
+ saidx_t l, r;
+ l = middle - first, r = last - middle;
+ for(; (0 < l) && (0 < r);) {
+ if(l == r) { ss_blockswap(first, middle, l); break; }
+ if(l < r) {
+ a = last - 1, b = middle - 1;
+ t = *a;
+ do {
+ *a-- = *b, *b-- = *a;
+ if(b < first) {
+ *a = t;
+ last = a;
+ if((r -= l + 1) <= l) { break; }
+ a -= 1, b = middle - 1;
+ t = *a;
+ }
+ } while(1);
+ } else {
+ a = first, b = middle;
+ t = *a;
+ do {
+ *a++ = *b, *b++ = *a;
+ if(last <= b) {
+ *a = t;
+ first = a + 1;
+ if((l -= r + 1) <= r) { break; }
+ a += 1, b = middle;
+ t = *a;
+ }
+ } while(1);
+ }
+ }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+static
+void
+ss_inplacemerge(const sauchar_t *T, const saidx_t *PA,
+ saidx_t *first, saidx_t *middle, saidx_t *last,
+ saidx_t depth) {
+ const saidx_t *p;
+ saidx_t *a, *b;
+ saidx_t len, half;
+ saint_t q, r;
+ saint_t x;
+
+ for(;;) {
+ if(*(last - 1) < 0) { x = 1; p = PA + ~*(last - 1); }
+ else { x = 0; p = PA + *(last - 1); }
+ for(a = first, len = middle - first, half = len >> 1, r = -1;
+ 0 < len;
+ len = half, half >>= 1) {
+ b = a + half;
+ q = ss_compare(T, PA + ((0 <= *b) ? *b : ~*b), p, depth);
+ if(q < 0) {
+ a = b + 1;
+ half -= (len & 1) ^ 1;
+ } else {
+ r = q;
+ }
+ }
+ if(a < middle) {
+ if(r == 0) { *a = ~*a; }
+ ss_rotate(a, middle, last);
+ last -= middle - a;
+ middle = a;
+ if(first == middle) { break; }
+ }
+ --last;
+ if(x != 0) { while(*--last < 0) { } }
+ if(middle == last) { break; }
+ }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Merge-forward with internal buffer. */
+static
+void
+ss_mergeforward(const sauchar_t *T, const saidx_t *PA,
+ saidx_t *first, saidx_t *middle, saidx_t *last,
+ saidx_t *buf, saidx_t depth) {
+ saidx_t *a, *b, *c, *bufend;
+ saidx_t t;
+ saint_t r;
+
+ bufend = buf + (middle - first) - 1;
+ ss_blockswap(buf, first, middle - first);
+
+ for(t = *(a = first), b = buf, c = middle;;) {
+ r = ss_compare(T, PA + *b, PA + *c, depth);
+ if(r < 0) {
+ do {
+ *a++ = *b;
+ if(bufend <= b) { *bufend = t; return; }
+ *b++ = *a;
+ } while(*b < 0);
+ } else if(r > 0) {
+ do {
+ *a++ = *c, *c++ = *a;
+ if(last <= c) {
+ while(b < bufend) { *a++ = *b, *b++ = *a; }
+ *a = *b, *b = t;
+ return;
+ }
+ } while(*c < 0);
+ } else {
+ *c = ~*c;
+ do {
+ *a++ = *b;
+ if(bufend <= b) { *bufend = t; return; }
+ *b++ = *a;
+ } while(*b < 0);
+
+ do {
+ *a++ = *c, *c++ = *a;
+ if(last <= c) {
+ while(b < bufend) { *a++ = *b, *b++ = *a; }
+ *a = *b, *b = t;
+ return;
+ }
+ } while(*c < 0);
+ }
+ }
+}
+
+/* Merge-backward with internal buffer. */
+static
+void
+ss_mergebackward(const sauchar_t *T, const saidx_t *PA,
+ saidx_t *first, saidx_t *middle, saidx_t *last,
+ saidx_t *buf, saidx_t depth) {
+ const saidx_t *p1, *p2;
+ saidx_t *a, *b, *c, *bufend;
+ saidx_t t;
+ saint_t r;
+ saint_t x;
+
+ bufend = buf + (last - middle) - 1;
+ ss_blockswap(buf, middle, last - middle);
+
+ x = 0;
+ if(*bufend < 0) { p1 = PA + ~*bufend; x |= 1; }
+ else { p1 = PA + *bufend; }
+ if(*(middle - 1) < 0) { p2 = PA + ~*(middle - 1); x |= 2; }
+ else { p2 = PA + *(middle - 1); }
+ for(t = *(a = last - 1), b = bufend, c = middle - 1;;) {
+ r = ss_compare(T, p1, p2, depth);
+ if(0 < r) {
+ if(x & 1) { do { *a-- = *b, *b-- = *a; } while(*b < 0); x ^= 1; }
+ *a-- = *b;
+ if(b <= buf) { *buf = t; break; }
+ *b-- = *a;
+ if(*b < 0) { p1 = PA + ~*b; x |= 1; }
+ else { p1 = PA + *b; }
+ } else if(r < 0) {
+ if(x & 2) { do { *a-- = *c, *c-- = *a; } while(*c < 0); x ^= 2; }
+ *a-- = *c, *c-- = *a;
+ if(c < first) {
+ while(buf < b) { *a-- = *b, *b-- = *a; }
+ *a = *b, *b = t;
+ break;
+ }
+ if(*c < 0) { p2 = PA + ~*c; x |= 2; }
+ else { p2 = PA + *c; }
+ } else {
+ if(x & 1) { do { *a-- = *b, *b-- = *a; } while(*b < 0); x ^= 1; }
+ *a-- = ~*b;
+ if(b <= buf) { *buf = t; break; }
+ *b-- = *a;
+ if(x & 2) { do { *a-- = *c, *c-- = *a; } while(*c < 0); x ^= 2; }
+ *a-- = *c, *c-- = *a;
+ if(c < first) {
+ while(buf < b) { *a-- = *b, *b-- = *a; }
+ *a = *b, *b = t;
+ break;
+ }
+ if(*b < 0) { p1 = PA + ~*b; x |= 1; }
+ else { p1 = PA + *b; }
+ if(*c < 0) { p2 = PA + ~*c; x |= 2; }
+ else { p2 = PA + *c; }
+ }
+ }
+}
+
+/* D&C based merge. */
+static
+void
+ss_swapmerge(const sauchar_t *T, const saidx_t *PA,
+ saidx_t *first, saidx_t *middle, saidx_t *last,
+ saidx_t *buf, saidx_t bufsize, saidx_t depth) {
+#define STACK_SIZE SS_SMERGE_STACKSIZE
+#define GETIDX(a) ((0 <= (a)) ? (a) : (~(a)))
+#define MERGE_CHECK(a, b, c)\
+ do {\
+ if(((c) & 1) ||\
+ (((c) & 2) && (ss_compare(T, PA + GETIDX(*((a) - 1)), PA + *(a), depth) == 0))) {\
+ *(a) = ~*(a);\
+ }\
+ if(((c) & 4) && ((ss_compare(T, PA + GETIDX(*((b) - 1)), PA + *(b), depth) == 0))) {\
+ *(b) = ~*(b);\
+ }\
+ } while(0)
+ struct { saidx_t *a, *b, *c; saint_t d; } stack[STACK_SIZE];
+ saidx_t *l, *r, *lm, *rm;
+ saidx_t m, len, half;
+ saint_t ssize;
+ saint_t check, next;
+
+ for(check = 0, ssize = 0;;) {
+ if((last - middle) <= bufsize) {
+ if((first < middle) && (middle < last)) {
+ ss_mergebackward(T, PA, first, middle, last, buf, depth);
+ }
+ MERGE_CHECK(first, last, check);
+ STACK_POP(first, middle, last, check);
+ continue;
+ }
+
+ if((middle - first) <= bufsize) {
+ if(first < middle) {
+ ss_mergeforward(T, PA, first, middle, last, buf, depth);
+ }
+ MERGE_CHECK(first, last, check);
+ STACK_POP(first, middle, last, check);
+ continue;
+ }
+
+ for(m = 0, len = MIN(middle - first, last - middle), half = len >> 1;
+ 0 < len;
+ len = half, half >>= 1) {
+ if(ss_compare(T, PA + GETIDX(*(middle + m + half)),
+ PA + GETIDX(*(middle - m - half - 1)), depth) < 0) {
+ m += half + 1;
+ half -= (len & 1) ^ 1;
+ }
+ }
+
+ if(0 < m) {
+ lm = middle - m, rm = middle + m;
+ ss_blockswap(lm, middle, m);
+ l = r = middle, next = 0;
+ if(rm < last) {
+ if(*rm < 0) {
+ *rm = ~*rm;
+ if(first < lm) { for(; *--l < 0;) { } next |= 4; }
+ next |= 1;
+ } else if(first < lm) {
+ for(; *r < 0; ++r) { }
+ next |= 2;
+ }
+ }
+
+ if((l - first) <= (last - r)) {
+ STACK_PUSH(r, rm, last, (next & 3) | (check & 4));
+ middle = lm, last = l, check = (check & 3) | (next & 4);
+ } else {
+ if((next & 2) && (r == middle)) { next ^= 6; }
+ STACK_PUSH(first, lm, l, (check & 3) | (next & 4));
+ first = r, middle = rm, check = (next & 3) | (check & 4);
+ }
+ } else {
+ if(ss_compare(T, PA + GETIDX(*(middle - 1)), PA + *middle, depth) == 0) {
+ *middle = ~*middle;
+ }
+ MERGE_CHECK(first, last, check);
+ STACK_POP(first, middle, last, check);
+ }
+ }
+#undef STACK_SIZE
+}
+
+#endif /* SS_BLOCKSIZE != 0 */
+
+
+/*---------------------------------------------------------------------------*/
+
+/*- Function -*/
+
+/* Substring sort */
+void
+sssort(const sauchar_t *T, const saidx_t *PA,
+ saidx_t *first, saidx_t *last,
+ saidx_t *buf, saidx_t bufsize,
+ saidx_t depth, saidx_t n, saint_t lastsuffix) {
+ saidx_t *a;
+#if SS_BLOCKSIZE != 0
+ saidx_t *b, *middle, *curbuf;
+ saidx_t j, k, curbufsize, limit;
+#endif
+ saidx_t i;
+
+ if(lastsuffix != 0) { ++first; }
+
+#if SS_BLOCKSIZE == 0
+ ss_mintrosort(T, PA, first, last, depth);
+#else
+ if((bufsize < SS_BLOCKSIZE) &&
+ (bufsize < (last - first)) &&
+ (bufsize < (limit = ss_isqrt(last - first)))) {
+ if(SS_BLOCKSIZE < limit) { limit = SS_BLOCKSIZE; }
+ buf = middle = last - limit, bufsize = limit;
+ } else {
+ middle = last, limit = 0;
+ }
+ for(a = first, i = 0; SS_BLOCKSIZE < (middle - a); a += SS_BLOCKSIZE, ++i) {
+#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE
+ ss_mintrosort(T, PA, a, a + SS_BLOCKSIZE, depth);
+#elif 1 < SS_BLOCKSIZE
+ ss_insertionsort(T, PA, a, a + SS_BLOCKSIZE, depth);
+#endif
+ curbufsize = last - (a + SS_BLOCKSIZE);
+ curbuf = a + SS_BLOCKSIZE;
+ if(curbufsize <= bufsize) { curbufsize = bufsize, curbuf = buf; }
+ for(b = a, k = SS_BLOCKSIZE, j = i; j & 1; b -= k, k <<= 1, j >>= 1) {
+ ss_swapmerge(T, PA, b - k, b, b + k, curbuf, curbufsize, depth);
+ }
+ }
+#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE
+ ss_mintrosort(T, PA, a, middle, depth);
+#elif 1 < SS_BLOCKSIZE
+ ss_insertionsort(T, PA, a, middle, depth);
+#endif
+ for(k = SS_BLOCKSIZE; i != 0; k <<= 1, i >>= 1) {
+ if(i & 1) {
+ ss_swapmerge(T, PA, a - k, a, middle, buf, bufsize, depth);
+ a -= k;
+ }
+ }
+ if(limit != 0) {
+#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE
+ ss_mintrosort(T, PA, middle, last, depth);
+#elif 1 < SS_BLOCKSIZE
+ ss_insertionsort(T, PA, middle, last, depth);
+#endif
+ ss_inplacemerge(T, PA, first, middle, last, depth);
+ }
+#endif
+
+ if(lastsuffix != 0) {
+ /* Insert last type B* suffix. */
+ saidx_t PAi[2]; PAi[0] = PA[*(first - 1)], PAi[1] = n - 2;
+ for(a = first, i = *(first - 1);
+ (a < last) && ((*a < 0) || (0 < ss_compare(T, &(PAi[0]), PA + *a, depth)));
+ ++a) {
+ *(a - 1) = *a;
+ }
+ *(a - 1) = i;
+ }
+}
diff --git a/tools/z64compress/src/enc/apultra/trsort.c b/tools/z64compress/src/enc/apultra/trsort.c
new file mode 100644
index 000000000..6fe3e67ba
--- /dev/null
+++ b/tools/z64compress/src/enc/apultra/trsort.c
@@ -0,0 +1,586 @@
+/*
+ * trsort.c for libdivsufsort
+ * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "divsufsort_private.h"
+
+
+/*- Private Functions -*/
+
+static const saint_t lg_table[256]= {
+ -1,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
+ 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
+ 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+ 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
+};
+
+static INLINE
+saint_t
+tr_ilg(saidx_t n) {
+#if defined(BUILD_DIVSUFSORT64)
+ return (n >> 32) ?
+ ((n >> 48) ?
+ ((n >> 56) ?
+ 56 + lg_table[(n >> 56) & 0xff] :
+ 48 + lg_table[(n >> 48) & 0xff]) :
+ ((n >> 40) ?
+ 40 + lg_table[(n >> 40) & 0xff] :
+ 32 + lg_table[(n >> 32) & 0xff])) :
+ ((n & 0xffff0000) ?
+ ((n & 0xff000000) ?
+ 24 + lg_table[(n >> 24) & 0xff] :
+ 16 + lg_table[(n >> 16) & 0xff]) :
+ ((n & 0x0000ff00) ?
+ 8 + lg_table[(n >> 8) & 0xff] :
+ 0 + lg_table[(n >> 0) & 0xff]));
+#else
+ return (n & 0xffff0000) ?
+ ((n & 0xff000000) ?
+ 24 + lg_table[(n >> 24) & 0xff] :
+ 16 + lg_table[(n >> 16) & 0xff]) :
+ ((n & 0x0000ff00) ?
+ 8 + lg_table[(n >> 8) & 0xff] :
+ 0 + lg_table[(n >> 0) & 0xff]);
+#endif
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Simple insertionsort for small size groups. */
+static
+void
+tr_insertionsort(const saidx_t *ISAd, saidx_t *first, saidx_t *last) {
+ saidx_t *a, *b;
+ saidx_t t, r;
+
+ for(a = first + 1; a < last; ++a) {
+ for(t = *a, b = a - 1; 0 > (r = ISAd[t] - ISAd[*b]);) {
+ do { *(b + 1) = *b; } while((first <= --b) && (*b < 0));
+ if(b < first) { break; }
+ }
+ if(r == 0) { *b = ~*b; }
+ *(b + 1) = t;
+ }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+static INLINE
+void
+tr_fixdown(const saidx_t *ISAd, saidx_t *SA, saidx_t i, saidx_t size) {
+ saidx_t j, k;
+ saidx_t v;
+ saidx_t c, d, e;
+
+ for(v = SA[i], c = ISAd[v]; (j = 2 * i + 1) < size; SA[i] = SA[k], i = k) {
+ d = ISAd[SA[k = j++]];
+ if(d < (e = ISAd[SA[j]])) { k = j; d = e; }
+ if(d <= c) { break; }
+ }
+ SA[i] = v;
+}
+
+/* Simple top-down heapsort. */
+static
+void
+tr_heapsort(const saidx_t *ISAd, saidx_t *SA, saidx_t size) {
+ saidx_t i, m;
+ saidx_t t;
+
+ m = size;
+ if((size % 2) == 0) {
+ m--;
+ if(ISAd[SA[m / 2]] < ISAd[SA[m]]) { SWAP(SA[m], SA[m / 2]); }
+ }
+
+ for(i = m / 2 - 1; 0 <= i; --i) { tr_fixdown(ISAd, SA, i, m); }
+ if((size % 2) == 0) { SWAP(SA[0], SA[m]); tr_fixdown(ISAd, SA, 0, m); }
+ for(i = m - 1; 0 < i; --i) {
+ t = SA[0], SA[0] = SA[i];
+ tr_fixdown(ISAd, SA, 0, i);
+ SA[i] = t;
+ }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Returns the median of three elements. */
+static INLINE
+saidx_t *
+tr_median3(const saidx_t *ISAd, saidx_t *v1, saidx_t *v2, saidx_t *v3) {
+ saidx_t *t;
+ if(ISAd[*v1] > ISAd[*v2]) { SWAP(v1, v2); }
+ if(ISAd[*v2] > ISAd[*v3]) {
+ if(ISAd[*v1] > ISAd[*v3]) { return v1; }
+ else { return v3; }
+ }
+ return v2;
+}
+
+/* Returns the median of five elements. */
+static INLINE
+saidx_t *
+tr_median5(const saidx_t *ISAd,
+ saidx_t *v1, saidx_t *v2, saidx_t *v3, saidx_t *v4, saidx_t *v5) {
+ saidx_t *t;
+ if(ISAd[*v2] > ISAd[*v3]) { SWAP(v2, v3); }
+ if(ISAd[*v4] > ISAd[*v5]) { SWAP(v4, v5); }
+ if(ISAd[*v2] > ISAd[*v4]) { SWAP(v2, v4); SWAP(v3, v5); }
+ if(ISAd[*v1] > ISAd[*v3]) { SWAP(v1, v3); }
+ if(ISAd[*v1] > ISAd[*v4]) { SWAP(v1, v4); SWAP(v3, v5); }
+ if(ISAd[*v3] > ISAd[*v4]) { return v4; }
+ return v3;
+}
+
+/* Returns the pivot element. */
+static INLINE
+saidx_t *
+tr_pivot(const saidx_t *ISAd, saidx_t *first, saidx_t *last) {
+ saidx_t *middle;
+ saidx_t t;
+
+ t = last - first;
+ middle = first + t / 2;
+
+ if(t <= 512) {
+ if(t <= 32) {
+ return tr_median3(ISAd, first, middle, last - 1);
+ } else {
+ t >>= 2;
+ return tr_median5(ISAd, first, first + t, middle, last - 1 - t, last - 1);
+ }
+ }
+ t >>= 3;
+ first = tr_median3(ISAd, first, first + t, first + (t << 1));
+ middle = tr_median3(ISAd, middle - t, middle, middle + t);
+ last = tr_median3(ISAd, last - 1 - (t << 1), last - 1 - t, last - 1);
+ return tr_median3(ISAd, first, middle, last);
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+typedef struct _trbudget_t trbudget_t;
+struct _trbudget_t {
+ saidx_t chance;
+ saidx_t remain;
+ saidx_t incval;
+ saidx_t count;
+};
+
+static INLINE
+void
+trbudget_init(trbudget_t *budget, saidx_t chance, saidx_t incval) {
+ budget->chance = chance;
+ budget->remain = budget->incval = incval;
+}
+
+static INLINE
+saint_t
+trbudget_check(trbudget_t *budget, saidx_t size) {
+ if(size <= budget->remain) { budget->remain -= size; return 1; }
+ if(budget->chance == 0) { budget->count += size; return 0; }
+ budget->remain += budget->incval - size;
+ budget->chance -= 1;
+ return 1;
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+static INLINE
+void
+tr_partition(const saidx_t *ISAd,
+ saidx_t *first, saidx_t *middle, saidx_t *last,
+ saidx_t **pa, saidx_t **pb, saidx_t v) {
+ saidx_t *a, *b, *c, *d, *e, *f;
+ saidx_t t, s;
+ saidx_t x = 0;
+
+ for(b = middle - 1; (++b < last) && ((x = ISAd[*b]) == v);) { }
+ if(((a = b) < last) && (x < v)) {
+ for(; (++b < last) && ((x = ISAd[*b]) <= v);) {
+ if(x == v) { SWAP(*b, *a); ++a; }
+ }
+ }
+ for(c = last; (b < --c) && ((x = ISAd[*c]) == v);) { }
+ if((b < (d = c)) && (x > v)) {
+ for(; (b < --c) && ((x = ISAd[*c]) >= v);) {
+ if(x == v) { SWAP(*c, *d); --d; }
+ }
+ }
+ for(; b < c;) {
+ SWAP(*b, *c);
+ for(; (++b < c) && ((x = ISAd[*b]) <= v);) {
+ if(x == v) { SWAP(*b, *a); ++a; }
+ }
+ for(; (b < --c) && ((x = ISAd[*c]) >= v);) {
+ if(x == v) { SWAP(*c, *d); --d; }
+ }
+ }
+
+ if(a <= d) {
+ c = b - 1;
+ if((s = a - first) > (t = b - a)) { s = t; }
+ for(e = first, f = b - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); }
+ if((s = d - c) > (t = last - d - 1)) { s = t; }
+ for(e = b, f = last - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); }
+ first += (b - a), last -= (d - c);
+ }
+ *pa = first, *pb = last;
+}
+
+static
+void
+tr_copy(saidx_t *ISA, const saidx_t *SA,
+ saidx_t *first, saidx_t *a, saidx_t *b, saidx_t *last,
+ saidx_t depth) {
+ /* sort suffixes of middle partition
+ by using sorted order of suffixes of left and right partition. */
+ saidx_t *c, *d, *e;
+ saidx_t s, v;
+
+ v = b - SA - 1;
+ for(c = first, d = a - 1; c <= d; ++c) {
+ if((0 <= (s = *c - depth)) && (ISA[s] == v)) {
+ *++d = s;
+ ISA[s] = d - SA;
+ }
+ }
+ for(c = last - 1, e = d + 1, d = b; e < d; --c) {
+ if((0 <= (s = *c - depth)) && (ISA[s] == v)) {
+ *--d = s;
+ ISA[s] = d - SA;
+ }
+ }
+}
+
+static
+void
+tr_partialcopy(saidx_t *ISA, const saidx_t *SA,
+ saidx_t *first, saidx_t *a, saidx_t *b, saidx_t *last,
+ saidx_t depth) {
+ saidx_t *c, *d, *e;
+ saidx_t s, v;
+ saidx_t rank, lastrank, newrank = -1;
+
+ v = b - SA - 1;
+ lastrank = -1;
+ for(c = first, d = a - 1; c <= d; ++c) {
+ if((0 <= (s = *c - depth)) && (ISA[s] == v)) {
+ *++d = s;
+ rank = ISA[s + depth];
+ if(lastrank != rank) { lastrank = rank; newrank = d - SA; }
+ ISA[s] = newrank;
+ }
+ }
+
+ lastrank = -1;
+ for(e = d; first <= e; --e) {
+ rank = ISA[*e];
+ if(lastrank != rank) { lastrank = rank; newrank = e - SA; }
+ if(newrank != rank) { ISA[*e] = newrank; }
+ }
+
+ lastrank = -1;
+ for(c = last - 1, e = d + 1, d = b; e < d; --c) {
+ if((0 <= (s = *c - depth)) && (ISA[s] == v)) {
+ *--d = s;
+ rank = ISA[s + depth];
+ if(lastrank != rank) { lastrank = rank; newrank = d - SA; }
+ ISA[s] = newrank;
+ }
+ }
+}
+
+static
+void
+tr_introsort(saidx_t *ISA, const saidx_t *ISAd,
+ saidx_t *SA, saidx_t *first, saidx_t *last,
+ trbudget_t *budget) {
+#define STACK_SIZE TR_STACKSIZE
+ struct { const saidx_t *a; saidx_t *b, *c; saint_t d, e; }stack[STACK_SIZE];
+ saidx_t *a, *b, *c;
+ saidx_t t;
+ saidx_t v, x = 0;
+ saidx_t incr = ISAd - ISA;
+ saint_t limit, next;
+ saint_t ssize, trlink = -1;
+
+ for(ssize = 0, limit = tr_ilg(last - first);;) {
+
+ if(limit < 0) {
+ if(limit == -1) {
+ /* tandem repeat partition */
+ tr_partition(ISAd - incr, first, first, last, &a, &b, last - SA - 1);
+
+ /* update ranks */
+ if(a < last) {
+ for(c = first, v = a - SA - 1; c < a; ++c) { ISA[*c] = v; }
+ }
+ if(b < last) {
+ for(c = a, v = b - SA - 1; c < b; ++c) { ISA[*c] = v; }
+ }
+
+ /* push */
+ if(1 < (b - a)) {
+ STACK_PUSH5(NULL, a, b, 0, 0);
+ STACK_PUSH5(ISAd - incr, first, last, -2, trlink);
+ trlink = ssize - 2;
+ }
+ if((a - first) <= (last - b)) {
+ if(1 < (a - first)) {
+ STACK_PUSH5(ISAd, b, last, tr_ilg(last - b), trlink);
+ last = a, limit = tr_ilg(a - first);
+ } else if(1 < (last - b)) {
+ first = b, limit = tr_ilg(last - b);
+ } else {
+ STACK_POP5(ISAd, first, last, limit, trlink);
+ }
+ } else {
+ if(1 < (last - b)) {
+ STACK_PUSH5(ISAd, first, a, tr_ilg(a - first), trlink);
+ first = b, limit = tr_ilg(last - b);
+ } else if(1 < (a - first)) {
+ last = a, limit = tr_ilg(a - first);
+ } else {
+ STACK_POP5(ISAd, first, last, limit, trlink);
+ }
+ }
+ } else if(limit == -2) {
+ /* tandem repeat copy */
+ a = stack[--ssize].b, b = stack[ssize].c;
+ if(stack[ssize].d == 0) {
+ tr_copy(ISA, SA, first, a, b, last, ISAd - ISA);
+ } else {
+ if(0 <= trlink) { stack[trlink].d = -1; }
+ tr_partialcopy(ISA, SA, first, a, b, last, ISAd - ISA);
+ }
+ STACK_POP5(ISAd, first, last, limit, trlink);
+ } else {
+ /* sorted partition */
+ if(0 <= *first) {
+ a = first;
+ do { ISA[*a] = a - SA; } while((++a < last) && (0 <= *a));
+ first = a;
+ }
+ if(first < last) {
+ a = first; do { *a = ~*a; } while(*++a < 0);
+ next = (ISA[*a] != ISAd[*a]) ? tr_ilg(a - first + 1) : -1;
+ if(++a < last) { for(b = first, v = a - SA - 1; b < a; ++b) { ISA[*b] = v; } }
+
+ /* push */
+ if(trbudget_check(budget, a - first)) {
+ if((a - first) <= (last - a)) {
+ STACK_PUSH5(ISAd, a, last, -3, trlink);
+ ISAd += incr, last = a, limit = next;
+ } else {
+ if(1 < (last - a)) {
+ STACK_PUSH5(ISAd + incr, first, a, next, trlink);
+ first = a, limit = -3;
+ } else {
+ ISAd += incr, last = a, limit = next;
+ }
+ }
+ } else {
+ if(0 <= trlink) { stack[trlink].d = -1; }
+ if(1 < (last - a)) {
+ first = a, limit = -3;
+ } else {
+ STACK_POP5(ISAd, first, last, limit, trlink);
+ }
+ }
+ } else {
+ STACK_POP5(ISAd, first, last, limit, trlink);
+ }
+ }
+ continue;
+ }
+
+ if((last - first) <= TR_INSERTIONSORT_THRESHOLD) {
+ tr_insertionsort(ISAd, first, last);
+ limit = -3;
+ continue;
+ }
+
+ if(limit-- == 0) {
+ tr_heapsort(ISAd, first, last - first);
+ for(a = last - 1; first < a; a = b) {
+ for(x = ISAd[*a], b = a - 1; (first <= b) && (ISAd[*b] == x); --b) { *b = ~*b; }
+ }
+ limit = -3;
+ continue;
+ }
+
+ /* choose pivot */
+ a = tr_pivot(ISAd, first, last);
+ SWAP(*first, *a);
+ v = ISAd[*first];
+
+ /* partition */
+ tr_partition(ISAd, first, first + 1, last, &a, &b, v);
+ if((last - first) != (b - a)) {
+ next = (ISA[*a] != v) ? tr_ilg(b - a) : -1;
+
+ /* update ranks */
+ for(c = first, v = a - SA - 1; c < a; ++c) { ISA[*c] = v; }
+ if(b < last) { for(c = a, v = b - SA - 1; c < b; ++c) { ISA[*c] = v; } }
+
+ /* push */
+ if((1 < (b - a)) && (trbudget_check(budget, b - a))) {
+ if((a - first) <= (last - b)) {
+ if((last - b) <= (b - a)) {
+ if(1 < (a - first)) {
+ STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+ STACK_PUSH5(ISAd, b, last, limit, trlink);
+ last = a;
+ } else if(1 < (last - b)) {
+ STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+ first = b;
+ } else {
+ ISAd += incr, first = a, last = b, limit = next;
+ }
+ } else if((a - first) <= (b - a)) {
+ if(1 < (a - first)) {
+ STACK_PUSH5(ISAd, b, last, limit, trlink);
+ STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+ last = a;
+ } else {
+ STACK_PUSH5(ISAd, b, last, limit, trlink);
+ ISAd += incr, first = a, last = b, limit = next;
+ }
+ } else {
+ STACK_PUSH5(ISAd, b, last, limit, trlink);
+ STACK_PUSH5(ISAd, first, a, limit, trlink);
+ ISAd += incr, first = a, last = b, limit = next;
+ }
+ } else {
+ if((a - first) <= (b - a)) {
+ if(1 < (last - b)) {
+ STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+ STACK_PUSH5(ISAd, first, a, limit, trlink);
+ first = b;
+ } else if(1 < (a - first)) {
+ STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+ last = a;
+ } else {
+ ISAd += incr, first = a, last = b, limit = next;
+ }
+ } else if((last - b) <= (b - a)) {
+ if(1 < (last - b)) {
+ STACK_PUSH5(ISAd, first, a, limit, trlink);
+ STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+ first = b;
+ } else {
+ STACK_PUSH5(ISAd, first, a, limit, trlink);
+ ISAd += incr, first = a, last = b, limit = next;
+ }
+ } else {
+ STACK_PUSH5(ISAd, first, a, limit, trlink);
+ STACK_PUSH5(ISAd, b, last, limit, trlink);
+ ISAd += incr, first = a, last = b, limit = next;
+ }
+ }
+ } else {
+ if((1 < (b - a)) && (0 <= trlink)) { stack[trlink].d = -1; }
+ if((a - first) <= (last - b)) {
+ if(1 < (a - first)) {
+ STACK_PUSH5(ISAd, b, last, limit, trlink);
+ last = a;
+ } else if(1 < (last - b)) {
+ first = b;
+ } else {
+ STACK_POP5(ISAd, first, last, limit, trlink);
+ }
+ } else {
+ if(1 < (last - b)) {
+ STACK_PUSH5(ISAd, first, a, limit, trlink);
+ first = b;
+ } else if(1 < (a - first)) {
+ last = a;
+ } else {
+ STACK_POP5(ISAd, first, last, limit, trlink);
+ }
+ }
+ }
+ } else {
+ if(trbudget_check(budget, last - first)) {
+ limit = tr_ilg(last - first), ISAd += incr;
+ } else {
+ if(0 <= trlink) { stack[trlink].d = -1; }
+ STACK_POP5(ISAd, first, last, limit, trlink);
+ }
+ }
+ }
+#undef STACK_SIZE
+}
+
+
+
+/*---------------------------------------------------------------------------*/
+
+/*- Function -*/
+
+/* Tandem repeat sort */
+void
+trsort(saidx_t *ISA, saidx_t *SA, saidx_t n, saidx_t depth) {
+ saidx_t *ISAd;
+ saidx_t *first, *last;
+ trbudget_t budget;
+ saidx_t t, skip, unsorted;
+
+ trbudget_init(&budget, tr_ilg(n) * 2 / 3, n);
+/* trbudget_init(&budget, tr_ilg(n) * 3 / 4, n); */
+ for(ISAd = ISA + depth; -n < *SA; ISAd += ISAd - ISA) {
+ first = SA;
+ skip = 0;
+ unsorted = 0;
+ do {
+ if((t = *first) < 0) { first -= t; skip += t; }
+ else {
+ if(skip != 0) { *(first + skip) = skip; skip = 0; }
+ last = SA + ISA[t] + 1;
+ if(1 < (last - first)) {
+ budget.count = 0;
+ tr_introsort(ISA, ISAd, SA, first, last, &budget);
+ if(budget.count != 0) { unsorted += budget.count; }
+ else { skip = first - last; }
+ } else if((last - first) == 1) {
+ skip = -1;
+ }
+ first = last;
+ }
+ } while(first < (SA + n));
+ if(skip != 0) { *(first + skip) = skip; }
+ if(unsorted == 0) { break; }
+ }
+}
diff --git a/tools/z64compress/src/enc/enc.h b/tools/z64compress/src/enc/enc.h
new file mode 100644
index 000000000..60bae3b96
--- /dev/null
+++ b/tools/z64compress/src/enc/enc.h
@@ -0,0 +1,59 @@
+#ifndef Z64COMPRESS_ENC_H_INCLUDED
+#define Z64COMPRESS_ENC_H_INCLUDED
+
+int yazenc(
+ void *src
+ , unsigned src_sz
+ , void *dst
+ , unsigned *dst_sz
+ , void *_ctx
+);
+void *yazCtx_new(void);
+void yazCtx_free(void *_ctx);
+int yazdec(void *_src, void *_dst, unsigned dstSz, unsigned *srcSz);
+
+int lzoenc(
+ void *src
+ , unsigned src_sz
+ , void *dst
+ , unsigned *dst_sz
+ , void *_ctx
+);
+void *lzoCtx_new(void);
+void lzoCtx_free(void *_ctx);
+
+int uclenc(
+ void *src
+ , unsigned src_sz
+ , void *dst
+ , unsigned *dst_sz
+ , void *_ctx
+);
+
+int zx7enc(
+ void *src
+ , unsigned src_sz
+ , void *dst
+ , unsigned *dst_sz
+ , void *_ctx
+);
+
+int
+zlibenc(
+ void *_src
+ , unsigned src_sz
+ , void *_dst
+ , unsigned *dst_sz
+ , void *_ctx
+);
+
+int aplenc(
+ void *_src
+ , unsigned src_sz
+ , void *_dst
+ , unsigned *dst_sz
+ , void *_ctx
+);
+
+#endif /* Z64COMPRESS_ENC_H_INCLUDED */
+
diff --git a/tools/z64compress/src/enc/libdeflate/.cirrus.yml b/tools/z64compress/src/enc/libdeflate/.cirrus.yml
new file mode 100644
index 000000000..a4f5cad51
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/.cirrus.yml
@@ -0,0 +1,10 @@
+task:
+ freebsd_instance:
+ matrix:
+ - image_family: freebsd-12-3
+ - image_family: freebsd-13-0
+ install_script: pkg install -y cmake
+ script:
+ - cmake -B build -DLIBDEFLATE_BUILD_TESTS=1
+ - cmake --build build
+ - ctest --test-dir build
diff --git a/tools/z64compress/src/enc/libdeflate/.github/workflows/ci.yml b/tools/z64compress/src/enc/libdeflate/.github/workflows/ci.yml
new file mode 100644
index 000000000..6902e8e16
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/.github/workflows/ci.yml
@@ -0,0 +1,192 @@
+name: CI
+on: [pull_request]
+
+jobs:
+ x86_64-build-and-test:
+ name: Build and test (x86_64, ${{ matrix.os }}, ${{ matrix.compiler }})
+ strategy:
+ matrix:
+ os: [ubuntu-20.04, ubuntu-18.04]
+ compiler: [gcc, clang]
+ runs-on: ${{ matrix.os }}
+ env:
+ CC: ${{ matrix.compiler }}
+ steps:
+ - uses: actions/checkout@v2
+ - name: Install dependencies
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y clang llvm libz-dev valgrind
+ - run: scripts/run_tests.sh
+
+ other-arch-build-and-test:
+ name: Build and test (${{ matrix.arch }}, Debian Bullseye, ${{ matrix.compiler }})
+ strategy:
+ matrix:
+ arch: [armv6, armv7, aarch64, s390x, ppc64le]
+ compiler: [gcc, clang]
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v2
+ - uses: uraimo/run-on-arch-action@v2.2.0
+ with:
+ arch: ${{ matrix.arch }}
+ distro: bullseye
+ githubToken: ${{ github.token }}
+ install: |
+ apt-get update
+ apt-get install -y build-essential cmake clang llvm libz-dev
+ run: |
+ tests=(regular)
+ if [ ${{matrix.compiler}} = clang ]; then
+ tests+=(ubsan)
+ fi
+ CC=${{matrix.compiler}} scripts/run_tests.sh "${tests[@]}"
+
+ macos-build-and-test:
+ name: Build and test (macOS)
+ runs-on: macos-latest
+ env:
+ CFLAGS: -Werror -DLIBDEFLATE_ENABLE_ASSERTIONS
+ steps:
+ - uses: actions/checkout@v2
+ - run: cmake -B build -DLIBDEFLATE_BUILD_TESTS=1
+ - run: cmake --build build --verbose
+ - run: ctest --test-dir build
+
+ windows-msys2-build-and-test:
+ name: Build and test (Windows, MSYS2, ${{matrix.sys}})
+ runs-on: windows-latest
+ strategy:
+ matrix:
+ include:
+ - { sys: mingw64, env: x86_64 }
+ - { sys: mingw32, env: i686 }
+ defaults:
+ run:
+ shell: msys2 {0}
+ env:
+ CFLAGS: -Werror -DLIBDEFLATE_ENABLE_ASSERTIONS
+ steps:
+ - uses: actions/checkout@v2
+ - uses: msys2/setup-msys2@v2
+ with:
+ msystem: ${{matrix.sys}}
+ update: true
+ install: >
+ make
+ mingw-w64-${{matrix.env}}-cc
+ mingw-w64-${{matrix.env}}-cmake
+ mingw-w64-${{matrix.env}}-ninja
+ mingw-w64-${{matrix.env}}-zlib
+ - run: cmake -B build -G Ninja -DLIBDEFLATE_BUILD_TESTS=1
+ - run: cmake --build build --verbose
+ - run: ctest --test-dir build
+
+ windows-visualstudio-build-and-test:
+ name: Build and test (Windows, Visual Studio ${{matrix.toolset}}, ${{matrix.platform.vs}})
+ strategy:
+ matrix:
+ platform: [ {vs: x64, vcpkg: x64-windows},
+ {vs: Win32, vcpkg: x86-windows} ]
+ toolset: [v143, ClangCL]
+ runs-on: windows-latest
+ steps:
+ - uses: actions/checkout@v2
+ - uses: microsoft/setup-msbuild@v1.1
+ - run: vcpkg install zlib:${{matrix.platform.vcpkg}}
+ - run: >
+ echo C:\vcpkg\packages\zlib_${{matrix.platform.vcpkg}}\bin
+ | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+ - run: >
+ cmake -B build -G "Visual Studio 17 2022" -T ${{matrix.toolset}}
+ -A ${{matrix.platform.vs}} -DLIBDEFLATE_BUILD_TESTS=1
+ -DCMAKE_C_FLAGS="/W4 /WX /DLIBDEFLATE_ENABLE_ASSERTIONS /IC:\vcpkg\packages\zlib_${{matrix.platform.vcpkg}}\include"
+ -DZLIB_LIBRARY=C:\vcpkg\packages\zlib_${{matrix.platform.vcpkg}}\lib\zlib.lib
+ - run: cmake --build build --verbose --config Debug
+ - run: ctest --test-dir build -C Debug
+
+ windows-visualstudio-build:
+ name: Build (Windows, Visual Studio ${{matrix.toolset}}, ${{matrix.platform}})
+ strategy:
+ matrix:
+ platform: [ARM64, ARM]
+ toolset: [v143, ClangCL]
+ exclude: # Exclude unsupported combinations
+ - platform: ARM
+ toolset: ClangCL
+ runs-on: windows-latest
+ steps:
+ - uses: actions/checkout@v2
+ - uses: microsoft/setup-msbuild@v1.1
+ - run: >
+ cmake -B build -G "Visual Studio 17 2022" -T ${{matrix.toolset}}
+ -A ${{matrix.platform}} -DCMAKE_C_FLAGS="/W4 /WX"
+ - run: cmake --build build --verbose
+
+ run-clang-static-analyzer:
+ name: Run clang static analyzer
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v2
+ - name: Install dependencies
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y clang-tools
+ - run: scan-build cmake -B build -DLIBDEFLATE_BUILD_TESTS=1
+ - run: scan-build cmake --build build --verbose
+
+ run-shellcheck:
+ name: Run shellcheck
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v2
+ - name: Install dependencies
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y shellcheck
+ - name: Run shellcheck
+ run: shellcheck scripts/*.sh
+
+ cross-compile-for-windows:
+ name: Cross compile for Windows
+ runs-on: ubuntu-latest
+ env:
+ CFLAGS: -Werror -DLIBDEFLATE_ENABLE_ASSERTIONS
+ steps:
+ - uses: actions/checkout@v2
+ - name: Install dependencies
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y gcc-mingw-w64-i686 gcc-mingw-w64-x86-64 libz-mingw-w64-dev
+ # Unfortunately Ubuntu doesn't have {i686,x86_64}-w64-mingw32-cmake like
+ # some distros have, so we have to provide our own toolchain files here.
+ - name: 32-bit build
+ run: |
+ scripts/cmake-helper.sh -DLIBDEFLATE_BUILD_TESTS=1 \
+ -DCMAKE_TOOLCHAIN_FILE=scripts/toolchain-i686-w64-mingw32.cmake
+ cmake --build build --verbose
+ - name: 64-bit build
+ run: |
+ scripts/cmake-helper.sh -DLIBDEFLATE_BUILD_TESTS=1 \
+ -DCMAKE_TOOLCHAIN_FILE=scripts/toolchain-x86_64-w64-mingw32.cmake
+ cmake --build build --verbose
+
+ cross-compile-for-android:
+ name: Cross compile for ${{matrix.abi}} Android on ${{matrix.os}}
+ strategy:
+ matrix:
+ os: [ubuntu-18.04, ubuntu-20.04, macos-latest]
+ abi: [armeabi-v7a, arm64-v8a, x86, x86_64]
+ runs-on: ${{matrix.os}}
+ env:
+ CFLAGS: -Werror -DLIBDEFLATE_ENABLE_ASSERTIONS
+ steps:
+ - uses: actions/checkout@v2
+ - run: |
+ scripts/cmake-helper.sh \
+ -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK_LATEST_HOME"/build/cmake/android.toolchain.cmake \
+ -DANDROID_ABI=${{matrix.abi}} \
+ -DANDROID_PLATFORM=28 \
+ -DLIBDEFLATE_BUILD_TESTS=1
+ cmake --build build --verbose
diff --git a/tools/z64compress/src/enc/libdeflate/.gitignore b/tools/z64compress/src/enc/libdeflate/.gitignore
new file mode 100644
index 000000000..3a696efc5
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/.gitignore
@@ -0,0 +1,3 @@
+/build*
+cscope*
+tags
diff --git a/tools/z64compress/src/enc/libdeflate/COPYING b/tools/z64compress/src/enc/libdeflate/COPYING
new file mode 100644
index 000000000..1f1b81cd5
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/COPYING
@@ -0,0 +1,21 @@
+Copyright 2016 Eric Biggers
+
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation files
+(the "Software"), to deal in the Software without restriction,
+including without limitation the rights to use, copy, modify, merge,
+publish, distribute, sublicense, and/or sell copies of the Software,
+and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/tools/z64compress/src/enc/libdeflate/NEWS.md b/tools/z64compress/src/enc/libdeflate/NEWS.md
new file mode 100644
index 000000000..497ae2199
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/NEWS.md
@@ -0,0 +1,389 @@
+# libdeflate release notes
+
+## Version 1.15
+
+* libdeflate now uses CMake instead of a plain Makefile.
+
+* Improved MSVC support. Enabled most architecture-specific code with MSVC,
+ fixed building with clang in MSVC compatibility mode, and other improvements.
+
+* When libdeflate is built with MinGW, the static library and import library are
+ now named using the MinGW convention (`*.a` and `*.dll.a`) instead of the
+ Visual Studio convention. This affects the official Windows binaries.
+
+## Version 1.14
+
+Significantly improved decompression performance on all platforms. Examples
+include (measuring DEFLATE only):
+
+| Platform | Speedup over v1.13 |
+|------------------------------------|--------------------|
+| x86_64 (Intel Comet Lake), gcc | 1.287x |
+| x86_64 (Intel Comet Lake), clang | 1.437x |
+| x86_64 (Intel Ice Lake), gcc | 1.332x |
+| x86_64 (Intel Ice Lake), clang | 1.296x |
+| x86_64 (Intel Sandy Bridge), gcc | 1.162x |
+| x86_64 (Intel Sandy Bridge), clang | 1.092x |
+| x86_64 (AMD Zen 2), gcc | 1.263x |
+| x86_64 (AMD Zen 2), clang | 1.259x |
+| i386 (Intel Comet Lake), gcc | 1.570x |
+| i386 (Intel Comet Lake), clang | 1.344x |
+| arm64 (Apple M1), clang | 1.306x |
+| arm64 (Cortex-A76), clang | 1.355x |
+| arm64 (Cortex-A55), clang | 1.190x |
+| arm32 (Cortex-A76), clang | 1.665x |
+| arm32 (Cortex-A55), clang | 1.283x |
+
+Thanks to Dougall Johnson (https://dougallj.wordpress.com/) for ideas for many
+of the improvements.
+
+## Version 1.13
+
+* Changed the 32-bit Windows build of the library to use the default calling
+ convention (cdecl) instead of stdcall, reverting a change from libdeflate 1.4.
+
+* Fixed a couple macOS compatibility issues with the gzip program.
+
+## Version 1.12
+
+This release focuses on improving the performance of the CRC-32 and Adler-32
+checksum algorithms on x86 and ARM (both 32-bit and 64-bit).
+
+* Build updates:
+
+ * Fixed building libdeflate on Apple platforms.
+
+ * For Visual Studio builds, Visual Studio 2015 or later is now required.
+
+* CRC-32 algorithm updates:
+
+ * Improved CRC-32 performance on short inputs on x86 and ARM.
+
+ * Improved CRC-32 performance on Apple Silicon Macs by using a 12-way pmull
+ implementation. Performance on large inputs on M1 is now about 67 GB/s,
+ compared to 8 GB/s before, or 31 GB/s with the Apple-provided zlib.
+
+ * Improved CRC-32 performance on some other ARM CPUs by reworking the code so
+ that multiple crc32 instructions can be issued in parallel.
+
+ * Improved CRC-32 performance on some x86 CPUs by increasing the stride length
+ of the pclmul implementation.
+
+* Adler-32 algorithm updates:
+
+ * Improved Adler-32 performance on some x86 CPUs by optimizing the AVX-2
+ implementation. E.g., performance on Zen 1 improved from 19 to 30 GB/s, and
+ on Ice Lake from 35 to 41 GB/s (if the AVX-512 implementation is excluded).
+
+ * Removed the AVX-512 implementation of Adler-32 to avoid CPU frequency
+ downclocking, and because the AVX-2 implementation was made faster.
+
+ * Improved Adler-32 performance on some ARM CPUs by optimizing the NEON
+ implementation. E.g., Apple M1 improved from about 36 to 52 GB/s.
+
+## Version 1.11
+
+* Library updates:
+
+ * Improved compression performance slightly.
+
+ * Detect arm64 CPU features on Apple platforms, which should improve
+ performance in some areas such as CRC-32 computation.
+
+* Program updates:
+
+ * The included `gzip` and `gunzip` programs now support the `-q` option.
+
+ * The included `gunzip` program now passes through non-gzip data when both
+ the `-f` and `-c` options are used.
+
+* Build updates:
+
+ * Avoided a build error on arm32 with certain gcc versions, by disabling
+ building `crc32_arm()` as dynamically-dispatched code when needed.
+
+ * Support building with the LLVM toolchain on Windows.
+
+ * Disabled the use of the "stdcall" ABI in static library builds on Windows.
+
+ * Use the correct `install_name` in macOS builds.
+
+ * Support Haiku builds.
+
+## Version 1.10
+
+* Added an additional check to the decompressor to make it quickly detect
+ certain bad inputs and not try to generate an unbounded amount of output.
+
+ Note: this was only a problem when decompressing with an unknown output size,
+ which isn't the recommended use case of libdeflate. However,
+ `libdeflate-gunzip` has to do this, and it would run out of memory as it would
+ keep trying to allocate a larger output buffer.
+
+* Fixed a build error on Solaris.
+
+* Cleaned up a few things in the compression code.
+
+## Version 1.9
+
+* Made many improvements to the compression algorithms, and rebalanced the
+ compression levels:
+
+ * Heuristics were implemented which significantly improve the compression
+ ratio on data where short matches aren't useful, such as DNA sequencing
+ data. This applies to all compression levels, but primarily to levels 1-9.
+
+ * Level 1 was made much faster, though it often compresses slightly worse than
+ before (but still better than zlib).
+
+ * Levels 8-9 were also made faster, though they often compress slightly worse
+ than before (but still better than zlib). On some data, levels 8-9 are much
+ faster and compress much better than before; this change addressed an issue
+ where levels 8-9 did poorly on certain files. The algorithm used by levels
+ 8-9 is now more similar to that of levels 6-7 than to that of levels 10-12.
+
+ * Levels 2-3, 7, and 10-12 were strengthened slightly.
+
+ * Levels 4-6 were also strengthened slightly, but some of this improvement was
+ traded off to speed them up slightly as well.
+
+ * Levels 1-9 had their per-compressor memory usage greatly reduced.
+
+ As always, compression ratios will vary depending on the input data, and
+ compression speeds will vary depending on the input data and target platform.
+
+* `make install` will now install a pkg-config file for libdeflate.
+
+* The Makefile now supports the `DISABLE_SHARED` parameter to disable building
+ the shared library.
+
+* Improved the Android build support in the Makefile.
+
+## Version 1.8
+
+* Added `-t` (test) option to `libdeflate-gunzip`.
+
+* Unaligned access optimizations are now enabled on WebAssembly builds.
+
+* Fixed a build error when building with the Intel C Compiler (ICC).
+
+* Fixed a build error when building with uClibc.
+
+* libdeflate's CI system has switched from Travis CI to GitHub Actions.
+
+* Made some improvements to test scripts.
+
+## Version 1.7
+
+* Added support for compression level 0, "no compression".
+
+* Added an ARM CRC32 instruction accelerated implementation of CRC32.
+
+* Added support for linking the programs to the shared library version of
+ libdeflate rather than to the static library version.
+
+* Made the compression level affect the minimum input size at which compression
+ is attempted.
+
+* Fixed undefined behavior in x86 Adler32 implementation. (No miscompilations
+ were observed in practice.)
+
+* Fixed undefined behavior in x86 CPU feature code. (No miscompilations were
+ observed in practice.)
+
+* Fixed installing shared lib symlink on macOS.
+
+* Documented third-party bindings.
+
+* Made a lot of improvements to the testing scripts and the CI configuration
+ file.
+
+* Lots of other small improvements and cleanups.
+
+## Version 1.6
+
+* Prevented gcc 10 from miscompiling libdeflate (workaround for
+ https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94994).
+
+* Removed workaround for gcc 5 and earlier producing slow code on ARM32. If
+ this affects you, please upgrade your compiler.
+
+* New API function: `libdeflate_zlib_decompress_ex()`. It provides the actual
+ size of the stream that was decompressed, like the gzip and DEFLATE
+ equivalents.
+
+* `libdeflate_zlib_decompress()` now accepts trailing bytes after the end of the
+ stream, like the gzip and DEFLATE equivalents.
+
+* Added support for custom memory allocators. (New API function:
+ `libdeflate_set_memory_allocator()`)
+
+* Added support for building the library in freestanding mode.
+
+* Building libdeflate no longer requires `CPPFLAGS=-Icommon`.
+
+## Version 1.5
+
+* Fixed up stdcall support on 32-bit Windows: the functions are now exported
+ using both suffixed and non-suffixed names, and fixed `libdeflate.h` to be
+ MSVC-compatible again.
+
+## Version 1.4
+
+* The 32-bit Windows build of libdeflate now uses the "stdcall" calling
+ convention instead of "cdecl". If you're calling `libdeflate.dll` directly
+ from C or C++, you'll need to recompile your code. If you're calling it from
+ another language, or calling it indirectly using `LoadLibrary()`, you'll need
+ to update your code to use the stdcall calling convention.
+
+* The Makefile now supports building libdeflate as a shared
+ library (`.dylib`) on macOS.
+
+* Fixed a bug where support for certain optimizations and optional features
+ (file access hints and more precise timestamps) was incorrectly omitted when
+ libdeflate was compiled with `-Werror`.
+
+* Added `make check` target to the Makefile.
+
+* Added CI configuration files.
+
+## Version 1.3
+
+* `make install` now supports customizing the directories into which binaries,
+ headers, and libraries are installed.
+
+* `make install` now installs into `/usr/local` by default. To change it, use
+ e.g. `make install PREFIX=/usr`.
+
+* `make install` now works on more platforms.
+
+* The Makefile now supports overriding the optimization flags.
+
+* The compression functions now correctly handle an output data buffer >= 4 GiB
+ in size, and `gzip` and `gunzip` now correctly handle multi-gigabyte files (if
+ enough memory is available).
+
+## Version 1.2
+
+* Slight improvements to decompression speed.
+
+* Added an AVX-512BW implementation of Adler-32.
+
+* The Makefile now supports a user-specified installation `PREFIX`.
+
+* Fixed build error with some Visual Studio versions.
+
+## Version 1.1
+
+* Fixed crash in CRC-32 code when the prebuilt libdeflate for 32-bit Windows was
+ called by a program built with Visual Studio.
+
+* Improved the worst-case decompression speed of malicious data.
+
+* Fixed build error when compiling for an ARM processor without hardware
+ floating point support.
+
+* Improved performance on the PowerPC64 architecture.
+
+* Added soname to `libdeflate.so`, to make packaging easier.
+
+* Added `make install` target to the Makefile.
+
+* The Makefile now supports user-specified `CPPFLAGS`.
+
+* The Windows binary releases now include the import library for
+ `libdeflate.dll`. `libdeflate.lib` is now the import library, and
+ `libdeflatestatic.lib` is the static library.
+
+## Version 1.0
+
+* Added support for multi-member gzip files.
+
+* Moved architecture-specific code into subdirectories. If you aren't using the
+ provided Makefile to build libdeflate, you now need to compile `lib/*.c` and
+ `lib/*/*.c` instead of just `lib/*.c`.
+
+* Added an ARM PMULL implementation of CRC-32, which speeds up gzip compression
+ and decompression on 32-bit and 64-bit ARM processors that have the
+ Cryptography Extensions.
+
+* Improved detection of CPU features, resulting in accelerated functions being
+ used in more cases. This includes:
+
+ * Detect CPU features on 32-bit x86, not just 64-bit as was done previously.
+
+ * Detect CPU features on ARM, both 32 and 64-bit. (Limited to Linux only
+ currently.)
+
+## Version 0.8
+
+* Build fixes for certain platforms and compilers.
+
+* libdeflate now produces the same output on all CPU architectures.
+
+* Improved documentation for building libdeflate on Windows.
+
+## Version 0.7
+
+* Fixed a very rare bug that caused data to be compressed incorrectly. The bug
+ affected compression levels 7 and below since libdeflate v0.2. Although there
+ have been no user reports of the bug, and I believe it would have been highly
+ unlikely to encounter on realistic data, it could occur on data specially
+ crafted to reproduce it.
+
+* Fixed a compilation error when building with clang 3.7.
+
+## Version 0.6
+
+* Various improvements to the gzip program's behavior.
+
+* Faster CRC-32 on AVX-capable processors.
+
+* Other minor changes.
+
+## Version 0.5
+
+* The CRC-32 checksum algorithm has been optimized with carryless multiplication
+ instructions for `x86_64` (PCLMUL). This speeds up gzip compression and
+ decompression.
+
+* Build fixes for certain platforms and compilers.
+
+* Added more test programs and scripts.
+
+* libdeflate is now entirely MIT-licensed.
+
+## Version 0.4
+
+* The Adler-32 checksum algorithm has been optimized with vector instructions
+ for `x86_64` (SSE2 and AVX2) and ARM (NEON). This speeds up zlib compression
+ and decompression.
+
+* To avoid naming collisions, functions and definitions in libdeflate's API have
+ been renamed to be prefixed with `libdeflate_` or `LIBDEFLATE_`. Programs
+ using the old API will need to be updated.
+
+* Various bug fixes and other improvements.
+
+## Version 0.3
+
+* Some bug fixes and other minor changes.
+
+## Version 0.2
+
+* Implemented a new block splitting algorithm which typically improves the
+ compression ratio slightly at all compression levels.
+
+* The compressor now outputs each block using the cheapest type (dynamic
+ Huffman, static Huffman, or uncompressed).
+
+* The gzip program has received an overhaul and now behaves more like the
+ standard version.
+
+* Build system updates, including: some build options were changed and some
+ build options were removed, and the default 'make' target now includes the
+ gzip program as well as the library.
+
+## Version 0.1
+
+* Initial official release.
diff --git a/tools/z64compress/src/enc/libdeflate/README.md b/tools/z64compress/src/enc/libdeflate/README.md
new file mode 100644
index 000000000..f5bbd93c2
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/README.md
@@ -0,0 +1,204 @@
+# Overview
+
+libdeflate is a library for fast, whole-buffer DEFLATE-based compression and
+decompression.
+
+The supported formats are:
+
+- DEFLATE (raw)
+- zlib (a.k.a. DEFLATE with a zlib wrapper)
+- gzip (a.k.a. DEFLATE with a gzip wrapper)
+
+libdeflate is heavily optimized. It is significantly faster than the zlib
+library, both for compression and decompression, and especially on x86
+processors. In addition, libdeflate provides optional high compression modes
+that provide a better compression ratio than zlib's "level 9".
+
+libdeflate itself is a library. The following command-line programs which use
+this library are also included:
+
+* `libdeflate-gzip`, a program which can be a drop-in replacement for standard
+ `gzip` under some circumstances. Note that `libdeflate-gzip` has some
+ limitations; it is provided for convenience and is **not** meant to be the
+ main use case of libdeflate. It needs a lot of memory to process large files,
+ and it omits support for some infrequently-used options of GNU gzip.
+
+* `benchmark`, a test program that does round-trip compression and decompression
+ of the provided data, and measures the compression and decompression speed.
+ It can use libdeflate, zlib, or a combination of the two.
+
+* `checksum`, a test program that checksums the provided data with Adler-32 or
+ CRC-32, and optionally measures the speed. It can use libdeflate or zlib.
+
+For the release notes, see the [NEWS file](NEWS.md).
+
+## Table of Contents
+
+- [Building](#building)
+ - [Using CMake](#using-cmake)
+ - [Directly integrating the library sources](#directly-integrating-the-library-sources)
+- [API](#api)
+- [Bindings for other programming languages](#bindings-for-other-programming-languages)
+- [DEFLATE vs. zlib vs. gzip](#deflate-vs-zlib-vs-gzip)
+- [Compression levels](#compression-levels)
+- [Motivation](#motivation)
+- [License](#license)
+
+# Building
+
+## Using CMake
+
+libdeflate uses [CMake](https://cmake.org/). It can be built just like any
+other CMake project, e.g. with:
+
+ cmake -B build && cmake --build build
+
+By default the following targets are built:
+
+- The static library (normally called `libdeflate.a`)
+- The shared library (normally called `libdeflate.so`)
+- The `libdeflate-gzip` program, including its alias `libdeflate-gunzip`
+
+Besides the standard CMake build and installation options, there are some
+libdeflate-specific build options. See `CMakeLists.txt` for the list of these
+options. To set an option, add `-DOPTION=VALUE` to the `cmake` command.
+
+Prebuilt Windows binaries can be downloaded from
+https://github.com/ebiggers/libdeflate/releases.
+
+## Directly integrating the library sources
+
+Although the official build system is CMake, care has been taken to keep the
+library source files compilable directly, without a prerequisite configuration
+step. Therefore, it is also fine to just add the library source files directly
+to your application, without using CMake.
+
+You should compile both `lib/*.c` and `lib/*/*.c`. You don't need to worry
+about excluding irrelevant architecture-specific code, as this is already
+handled in the source files themselves using `#ifdef`s.
+
+It is strongly recommended to use either gcc or clang, and to use `-O2`.
+
+If you are doing a freestanding build with `-ffreestanding`, you must add
+`-DFREESTANDING` as well (matching what the `CMakeLists.txt` does).
+
+# API
+
+libdeflate has a simple API that is not zlib-compatible. You can create
+compressors and decompressors and use them to compress or decompress buffers.
+See libdeflate.h for details.
+
+There is currently no support for streaming. This has been considered, but it
+always significantly increases complexity and slows down fast paths.
+Unfortunately, at this point it remains a future TODO. So: if your application
+compresses data in "chunks", say, less than 1 MB in size, then libdeflate is a
+great choice for you; that's what it's designed to do. This is perfect for
+certain use cases such as transparent filesystem compression. But if your
+application compresses large files as a single compressed stream, similarly to
+the `gzip` program, then libdeflate isn't for you.
+
+Note that with chunk-based compression, you generally should have the
+uncompressed size of each chunk stored outside of the compressed data itself.
+This enables you to allocate an output buffer of the correct size without
+guessing. However, libdeflate's decompression routines do optionally provide
+the actual number of output bytes in case you need it.
+
+Windows developers: note that the calling convention of libdeflate.dll is
+"cdecl". (libdeflate v1.4 through v1.12 used "stdcall" instead.)
+
+# Bindings for other programming languages
+
+The libdeflate project itself only provides a C library. If you need to use
+libdeflate from a programming language other than C or C++, consider using the
+following bindings:
+
+* C#: [LibDeflate.NET](https://github.com/jzebedee/LibDeflate.NET)
+* Go: [go-libdeflate](https://github.com/4kills/go-libdeflate)
+* Java: [libdeflate-java](https://github.com/astei/libdeflate-java)
+* Julia: [LibDeflate.jl](https://github.com/jakobnissen/LibDeflate.jl)
+* Perl: [Gzip::Libdeflate](https://github.com/benkasminbullock/gzip-libdeflate)
+* Python: [deflate](https://github.com/dcwatson/deflate)
+* Ruby: [libdeflate-ruby](https://github.com/kaorimatz/libdeflate-ruby)
+* Rust: [libdeflater](https://github.com/adamkewley/libdeflater)
+
+Note: these are third-party projects which haven't necessarily been vetted by
+the authors of libdeflate. Please direct all questions, bugs, and improvements
+for these bindings to their authors.
+
+# DEFLATE vs. zlib vs. gzip
+
+The DEFLATE format ([rfc1951](https://www.ietf.org/rfc/rfc1951.txt)), the zlib
+format ([rfc1950](https://www.ietf.org/rfc/rfc1950.txt)), and the gzip format
+([rfc1952](https://www.ietf.org/rfc/rfc1952.txt)) are commonly confused with
+each other as well as with the [zlib software library](http://zlib.net), which
+actually supports all three formats. libdeflate (this library) also supports
+all three formats.
+
+Briefly, DEFLATE is a raw compressed stream, whereas zlib and gzip are different
+wrappers for this stream. Both zlib and gzip include checksums, but gzip can
+include extra information such as the original filename. Generally, you should
+choose a format as follows:
+
+- If you are compressing whole files with no subdivisions, similar to the `gzip`
+ program, you probably should use the gzip format.
+- Otherwise, if you don't need the features of the gzip header and footer but do
+ still want a checksum for corruption detection, you probably should use the
+ zlib format.
+- Otherwise, you probably should use raw DEFLATE. This is ideal if you don't
+ need checksums, e.g. because they're simply not needed for your use case or
+ because you already compute your own checksums that are stored separately from
+ the compressed stream.
+
+Note that gzip and zlib streams can be distinguished from each other based on
+their starting bytes, but this is not necessarily true of raw DEFLATE streams.
+
+# Compression levels
+
+An often-underappreciated fact of compression formats such as DEFLATE is that
+there are an enormous number of different ways that a given input could be
+compressed. Different algorithms and different amounts of computation time will
+result in different compression ratios, while remaining equally compatible with
+the decompressor.
+
+For this reason, the commonly used zlib library provides nine compression
+levels. Level 1 is the fastest but provides the worst compression; level 9
+provides the best compression but is the slowest. It defaults to level 6.
+libdeflate uses this same design but is designed to improve on both zlib's
+performance *and* compression ratio at every compression level. In addition,
+libdeflate's levels go [up to 12](https://xkcd.com/670/) to make room for a
+minimum-cost-path based algorithm (sometimes called "optimal parsing") that can
+significantly improve on zlib's compression ratio.
+
+If you are using DEFLATE (or zlib, or gzip) in your application, you should test
+different levels to see which works best for your application.
+
+# Motivation
+
+Despite DEFLATE's widespread use mainly through the zlib library, in the
+compression community this format from the early 1990s is often considered
+obsolete. And in a few significant ways, it is.
+
+So why implement DEFLATE at all, instead of focusing entirely on
+bzip2/LZMA/xz/LZ4/LZX/ZSTD/Brotli/LZHAM/LZFSE/[insert cool new format here]?
+
+To do something better, you need to understand what came before. And it turns
+out that most ideas from DEFLATE are still relevant. Many of the newer formats
+share a structure similar to DEFLATE's, with different tweaks. The effects of
+trivial but very useful tweaks, such as increasing the sliding window size, are
+often confused with the effects of nontrivial but less useful tweaks. And
+actually, many of these formats are similar enough that common algorithms and
+optimizations (e.g. those dealing with LZ77 matchfinding) can be reused.
+
+In addition, comparing compressors fairly is difficult because the performance
+of a compressor depends heavily on optimizations which are not intrinsic to the
+compression format itself. In this respect, the zlib library sometimes compares
+poorly to certain newer code because zlib is not well optimized for modern
+processors. libdeflate addresses this by providing an optimized DEFLATE
+implementation which can be used for benchmarking purposes. And, of course,
+real applications can use it as well.
+
+# License
+
+libdeflate is [MIT-licensed](COPYING).
+
+I am not aware of any patents or patent applications relevant to libdeflate.
diff --git a/tools/z64compress/src/enc/libdeflate/common_defs.h b/tools/z64compress/src/enc/libdeflate/common_defs.h
new file mode 100644
index 000000000..debdc7d41
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/common_defs.h
@@ -0,0 +1,716 @@
+/*
+ * common_defs.h
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef COMMON_DEFS_H
+#define COMMON_DEFS_H
+
+#include <stdbool.h>
+#include <stddef.h>	/* for size_t */
+#include <stdint.h>
+#ifdef _MSC_VER
+#  include <intrin.h>	/* for _BitScan*() and other intrinsics */
+#  include <stdlib.h>	/* for _byteswap_*() */
+ /* Disable MSVC warnings that are expected. */
+ /* /W2 */
+# pragma warning(disable : 4146) /* unary minus on unsigned type */
+ /* /W3 */
+# pragma warning(disable : 4018) /* signed/unsigned mismatch */
+# pragma warning(disable : 4244) /* possible loss of data */
+# pragma warning(disable : 4267) /* possible loss of precision */
+# pragma warning(disable : 4310) /* cast truncates constant value */
+ /* /W4 */
+# pragma warning(disable : 4100) /* unreferenced formal parameter */
+# pragma warning(disable : 4127) /* conditional expression is constant */
+# pragma warning(disable : 4189) /* local variable initialized but not referenced */
+# pragma warning(disable : 4232) /* nonstandard extension used */
+# pragma warning(disable : 4245) /* conversion from 'int' to 'unsigned int' */
+# pragma warning(disable : 4295) /* array too small to include terminating null */
+#endif
+#ifndef FREESTANDING
+#  include <string.h>	/* for memcpy() */
+#endif
+
+/* ========================================================================== */
+/* Target architecture */
+/* ========================================================================== */
+
+/* If possible, define a compiler-independent ARCH_* macro. */
+#undef ARCH_X86_64
+#undef ARCH_X86_32
+#undef ARCH_ARM64
+#undef ARCH_ARM32
+#ifdef _MSC_VER
+# if defined(_M_X64)
+# define ARCH_X86_64
+# elif defined(_M_IX86)
+# define ARCH_X86_32
+# elif defined(_M_ARM64)
+# define ARCH_ARM64
+# elif defined(_M_ARM)
+# define ARCH_ARM32
+# endif
+#else
+# if defined(__x86_64__)
+# define ARCH_X86_64
+# elif defined(__i386__)
+# define ARCH_X86_32
+# elif defined(__aarch64__)
+# define ARCH_ARM64
+# elif defined(__arm__)
+# define ARCH_ARM32
+# endif
+#endif
+
+/* ========================================================================== */
+/* Type definitions */
+/* ========================================================================== */
+
+/* Fixed-width integer types */
+typedef uint8_t u8;
+typedef uint16_t u16;
+typedef uint32_t u32;
+typedef uint64_t u64;
+typedef int8_t s8;
+typedef int16_t s16;
+typedef int32_t s32;
+typedef int64_t s64;
+
+/* ssize_t, if not available in <sys/types.h> */
+#ifdef _MSC_VER
+# ifdef _WIN64
+ typedef long long ssize_t;
+# else
+ typedef long ssize_t;
+# endif
+#endif
+
+/*
+ * Word type of the target architecture. Use 'size_t' instead of
+ * 'unsigned long' to account for platforms such as Windows that use 32-bit
+ * 'unsigned long' on 64-bit architectures.
+ */
+typedef size_t machine_word_t;
+
+/* Number of bytes in a word */
+#define WORDBYTES ((int)sizeof(machine_word_t))
+
+/* Number of bits in a word */
+#define WORDBITS (8 * WORDBYTES)
+
+/* ========================================================================== */
+/* Optional compiler features */
+/* ========================================================================== */
+
+/* Compiler version checks. Only use when absolutely necessary. */
+#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)
+# define GCC_PREREQ(major, minor) \
+ (__GNUC__ > (major) || \
+ (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
+#else
+# define GCC_PREREQ(major, minor) 0
+#endif
+#ifdef __clang__
+# ifdef __apple_build_version__
+# define CLANG_PREREQ(major, minor, apple_version) \
+ (__apple_build_version__ >= (apple_version))
+# else
+# define CLANG_PREREQ(major, minor, apple_version) \
+ (__clang_major__ > (major) || \
+ (__clang_major__ == (major) && __clang_minor__ >= (minor)))
+# endif
+#else
+# define CLANG_PREREQ(major, minor, apple_version) 0
+#endif
+
+/*
+ * Macros to check for compiler support for attributes and builtins. clang
+ * implements these macros, but gcc doesn't, so generally any use of one of
+ * these macros must also be combined with a gcc version check.
+ */
+#ifndef __has_attribute
+# define __has_attribute(attribute) 0
+#endif
+#ifndef __has_builtin
+# define __has_builtin(builtin) 0
+#endif
+
+/* inline - suggest that a function be inlined */
+#ifdef _MSC_VER
+# define inline __inline
+#endif /* else assume 'inline' is usable as-is */
+
+/* forceinline - force a function to be inlined, if possible */
+#if defined(__GNUC__) || __has_attribute(always_inline)
+# define forceinline inline __attribute__((always_inline))
+#elif defined(_MSC_VER)
+# define forceinline __forceinline
+#else
+# define forceinline inline
+#endif
+
+/* MAYBE_UNUSED - mark a function or variable as maybe unused */
+#if defined(__GNUC__) || __has_attribute(unused)
+# define MAYBE_UNUSED __attribute__((unused))
+#else
+# define MAYBE_UNUSED
+#endif
+
+/*
+ * restrict - hint that writes only occur through the given pointer.
+ *
+ * Don't use MSVC's __restrict, since it has nonstandard behavior.
+ * Standard restrict is okay, if it is supported.
+ */
+#if !defined(__STDC_VERSION__) || (__STDC_VERSION__ < 201112L)
+# if defined(__GNUC__) || defined(__clang__)
+# define restrict __restrict__
+# else
+# define restrict
+# endif
+#endif /* else assume 'restrict' is usable as-is */
+
+/* likely(expr) - hint that an expression is usually true */
+#if defined(__GNUC__) || __has_builtin(__builtin_expect)
+# define likely(expr) __builtin_expect(!!(expr), 1)
+#else
+# define likely(expr) (expr)
+#endif
+
+/* unlikely(expr) - hint that an expression is usually false */
+#if defined(__GNUC__) || __has_builtin(__builtin_expect)
+# define unlikely(expr) __builtin_expect(!!(expr), 0)
+#else
+# define unlikely(expr) (expr)
+#endif
+
+/* prefetchr(addr) - prefetch into L1 cache for read */
+#undef prefetchr
+#if defined(__GNUC__) || __has_builtin(__builtin_prefetch)
+# define prefetchr(addr) __builtin_prefetch((addr), 0)
+#elif defined(_MSC_VER)
+# if defined(ARCH_X86_32) || defined(ARCH_X86_64)
+# define prefetchr(addr) _mm_prefetch((addr), _MM_HINT_T0)
+# elif defined(ARCH_ARM64)
+# define prefetchr(addr) __prefetch2((addr), 0x00 /* prfop=PLDL1KEEP */)
+# elif defined(ARCH_ARM32)
+# define prefetchr(addr) __prefetch(addr)
+# endif
+#endif
+#ifndef prefetchr
+# define prefetchr(addr)
+#endif
+
+/* prefetchw(addr) - prefetch into L1 cache for write */
+#undef prefetchw
+#if defined(__GNUC__) || __has_builtin(__builtin_prefetch)
+# define prefetchw(addr) __builtin_prefetch((addr), 1)
+#elif defined(_MSC_VER)
+# if defined(ARCH_X86_32) || defined(ARCH_X86_64)
+# define prefetchw(addr) _m_prefetchw(addr)
+# elif defined(ARCH_ARM64)
+# define prefetchw(addr) __prefetch2((addr), 0x10 /* prfop=PSTL1KEEP */)
+# elif defined(ARCH_ARM32)
+# define prefetchw(addr) __prefetchw(addr)
+# endif
+#endif
+#ifndef prefetchw
+# define prefetchw(addr)
+#endif
+
+/*
+ * _aligned_attribute(n) - declare that the annotated variable, or variables of
+ * the annotated type, must be aligned on n-byte boundaries.
+ */
+#undef _aligned_attribute
+#if defined(__GNUC__) || __has_attribute(aligned)
+# define _aligned_attribute(n) __attribute__((aligned(n)))
+#elif defined(_MSC_VER)
+# define _aligned_attribute(n) __declspec(align(n))
+#endif
+
+/*
+ * _target_attribute(attrs) - override the compilation target for a function.
+ *
+ * This accepts one or more comma-separated suffixes to the -m prefix jointly
+ * forming the name of a machine-dependent option. On gcc-like compilers, this
+ * enables codegen for the given targets, including arbitrary compiler-generated
+ * code as well as the corresponding intrinsics. On other compilers this macro
+ * expands to nothing, though MSVC allows intrinsics to be used anywhere anyway.
+ */
+#if GCC_PREREQ(4, 4) || __has_attribute(target)
+# define _target_attribute(attrs) __attribute__((target(attrs)))
+# define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE 1
+#else
+# define _target_attribute(attrs)
+# define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE 0
+#endif
+
+/* ========================================================================== */
+/* Miscellaneous macros */
+/* ========================================================================== */
+
+#define ARRAY_LEN(A) (sizeof(A) / sizeof((A)[0]))
+#define MIN(a, b) ((a) <= (b) ? (a) : (b))
+#define MAX(a, b) ((a) >= (b) ? (a) : (b))
+#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
+#define STATIC_ASSERT(expr) ((void)sizeof(char[1 - 2 * !(expr)]))
+#define ALIGN(n, a) (((n) + (a) - 1) & ~((a) - 1))
+#define ROUND_UP(n, d) ((d) * DIV_ROUND_UP((n), (d)))
+
+/* ========================================================================== */
+/* Endianness handling */
+/* ========================================================================== */
+
+/*
+ * CPU_IS_LITTLE_ENDIAN() - 1 if the CPU is little endian, or 0 if it is big
+ * endian. When possible this is a compile-time macro that can be used in
+ * preprocessor conditionals. As a fallback, a generic method is used that
+ * can't be used in preprocessor conditionals but should still be optimized out.
+ */
+#if defined(__BYTE_ORDER__) /* gcc v4.6+ and clang */
+# define CPU_IS_LITTLE_ENDIAN() (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#elif defined(_MSC_VER)
+# define CPU_IS_LITTLE_ENDIAN() true
+#else
+static forceinline bool CPU_IS_LITTLE_ENDIAN(void)
+{
+ union {
+ u32 w;
+ u8 b;
+ } u;
+
+ u.w = 1;
+ return u.b;
+}
+#endif
+
+/* bswap16(v) - swap the bytes of a 16-bit integer */
+static forceinline u16 bswap16(u16 v)
+{
+#if GCC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16)
+ return __builtin_bswap16(v);
+#elif defined(_MSC_VER)
+ return _byteswap_ushort(v);
+#else
+ return (v << 8) | (v >> 8);
+#endif
+}
+
+/* bswap32(v) - swap the bytes of a 32-bit integer */
+static forceinline u32 bswap32(u32 v)
+{
+#if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap32)
+ return __builtin_bswap32(v);
+#elif defined(_MSC_VER)
+ return _byteswap_ulong(v);
+#else
+ return ((v & 0x000000FF) << 24) |
+ ((v & 0x0000FF00) << 8) |
+ ((v & 0x00FF0000) >> 8) |
+ ((v & 0xFF000000) >> 24);
+#endif
+}
+
+/* bswap64(v) - swap the bytes of a 64-bit integer */
+static forceinline u64 bswap64(u64 v)
+{
+#if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap64)
+ return __builtin_bswap64(v);
+#elif defined(_MSC_VER)
+ return _byteswap_uint64(v);
+#else
+ return ((v & 0x00000000000000FF) << 56) |
+ ((v & 0x000000000000FF00) << 40) |
+ ((v & 0x0000000000FF0000) << 24) |
+ ((v & 0x00000000FF000000) << 8) |
+ ((v & 0x000000FF00000000) >> 8) |
+ ((v & 0x0000FF0000000000) >> 24) |
+ ((v & 0x00FF000000000000) >> 40) |
+ ((v & 0xFF00000000000000) >> 56);
+#endif
+}
+
+#define le16_bswap(v) (CPU_IS_LITTLE_ENDIAN() ? (v) : bswap16(v))
+#define le32_bswap(v) (CPU_IS_LITTLE_ENDIAN() ? (v) : bswap32(v))
+#define le64_bswap(v) (CPU_IS_LITTLE_ENDIAN() ? (v) : bswap64(v))
+#define be16_bswap(v) (CPU_IS_LITTLE_ENDIAN() ? bswap16(v) : (v))
+#define be32_bswap(v) (CPU_IS_LITTLE_ENDIAN() ? bswap32(v) : (v))
+#define be64_bswap(v) (CPU_IS_LITTLE_ENDIAN() ? bswap64(v) : (v))
+
+/* ========================================================================== */
+/* Unaligned memory accesses */
+/* ========================================================================== */
+
+/*
+ * UNALIGNED_ACCESS_IS_FAST() - 1 if unaligned memory accesses can be performed
+ * efficiently on the target platform, otherwise 0.
+ */
+#if (defined(__GNUC__) || defined(__clang__)) && \
+ (defined(ARCH_X86_64) || defined(ARCH_X86_32) || \
+ defined(__ARM_FEATURE_UNALIGNED) || defined(__powerpc64__) || \
+ /*
+ * For all compilation purposes, WebAssembly behaves like any other CPU
+ * instruction set. Even though WebAssembly engine might be running on
+ * top of different actual CPU architectures, the WebAssembly spec
+ * itself permits unaligned access and it will be fast on most of those
+ * platforms, and simulated at the engine level on others, so it's
+ * worth treating it as a CPU architecture with fast unaligned access.
+ */ defined(__wasm__))
+# define UNALIGNED_ACCESS_IS_FAST 1
+#elif defined(_MSC_VER)
+# define UNALIGNED_ACCESS_IS_FAST 1
+#else
+# define UNALIGNED_ACCESS_IS_FAST 0
+#endif
+
+/*
+ * Implementing unaligned memory accesses using memcpy() is portable, and it
+ * usually gets optimized appropriately by modern compilers. I.e., each
+ * memcpy() of 1, 2, 4, or WORDBYTES bytes gets compiled to a load or store
+ * instruction, not to an actual function call.
+ *
+ * We no longer use the "packed struct" approach to unaligned accesses, as that
+ * is nonstandard, has unclear semantics, and doesn't receive enough testing
+ * (see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94994).
+ *
+ * arm32 with __ARM_FEATURE_UNALIGNED in gcc 5 and earlier is a known exception
+ * where memcpy() generates inefficient code
+ * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67366). However, we no longer
+ * consider that one case important enough to maintain different code for.
+ * If you run into it, please just use a newer version of gcc (or use clang).
+ */
+
+#ifdef FREESTANDING
+# define MEMCOPY __builtin_memcpy
+#else
+# define MEMCOPY memcpy
+#endif
+
+/* Unaligned loads and stores without endianness conversion */
+
+#define DEFINE_UNALIGNED_TYPE(type) \
+static forceinline type \
+load_##type##_unaligned(const void *p) \
+{ \
+ type v; \
+ \
+ MEMCOPY(&v, p, sizeof(v)); \
+ return v; \
+} \
+ \
+static forceinline void \
+store_##type##_unaligned(type v, void *p) \
+{ \
+ MEMCOPY(p, &v, sizeof(v)); \
+}
+
+DEFINE_UNALIGNED_TYPE(u16)
+DEFINE_UNALIGNED_TYPE(u32)
+DEFINE_UNALIGNED_TYPE(u64)
+DEFINE_UNALIGNED_TYPE(machine_word_t)
+
+#undef MEMCOPY
+
+#define load_word_unaligned load_machine_word_t_unaligned
+#define store_word_unaligned store_machine_word_t_unaligned
+
+/* Unaligned loads with endianness conversion */
+
+static forceinline u16
+get_unaligned_le16(const u8 *p)
+{
+ if (UNALIGNED_ACCESS_IS_FAST)
+ return le16_bswap(load_u16_unaligned(p));
+ else
+ return ((u16)p[1] << 8) | p[0];
+}
+
+static forceinline u16
+get_unaligned_be16(const u8 *p)
+{
+ if (UNALIGNED_ACCESS_IS_FAST)
+ return be16_bswap(load_u16_unaligned(p));
+ else
+ return ((u16)p[0] << 8) | p[1];
+}
+
+static forceinline u32
+get_unaligned_le32(const u8 *p)
+{
+ if (UNALIGNED_ACCESS_IS_FAST)
+ return le32_bswap(load_u32_unaligned(p));
+ else
+ return ((u32)p[3] << 24) | ((u32)p[2] << 16) |
+ ((u32)p[1] << 8) | p[0];
+}
+
+static forceinline u32
+get_unaligned_be32(const u8 *p)
+{
+ if (UNALIGNED_ACCESS_IS_FAST)
+ return be32_bswap(load_u32_unaligned(p));
+ else
+ return ((u32)p[0] << 24) | ((u32)p[1] << 16) |
+ ((u32)p[2] << 8) | p[3];
+}
+
+static forceinline u64
+get_unaligned_le64(const u8 *p)
+{
+ if (UNALIGNED_ACCESS_IS_FAST)
+ return le64_bswap(load_u64_unaligned(p));
+ else
+ return ((u64)p[7] << 56) | ((u64)p[6] << 48) |
+ ((u64)p[5] << 40) | ((u64)p[4] << 32) |
+ ((u64)p[3] << 24) | ((u64)p[2] << 16) |
+ ((u64)p[1] << 8) | p[0];
+}
+
+static forceinline machine_word_t
+get_unaligned_leword(const u8 *p)
+{
+ STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
+ if (WORDBITS == 32)
+ return get_unaligned_le32(p);
+ else
+ return get_unaligned_le64(p);
+}
+
+/* Unaligned stores with endianness conversion */
+
+static forceinline void
+put_unaligned_le16(u16 v, u8 *p)
+{
+ if (UNALIGNED_ACCESS_IS_FAST) {
+ store_u16_unaligned(le16_bswap(v), p);
+ } else {
+ p[0] = (u8)(v >> 0);
+ p[1] = (u8)(v >> 8);
+ }
+}
+
+static forceinline void
+put_unaligned_be16(u16 v, u8 *p)
+{
+ if (UNALIGNED_ACCESS_IS_FAST) {
+ store_u16_unaligned(be16_bswap(v), p);
+ } else {
+ p[0] = (u8)(v >> 8);
+ p[1] = (u8)(v >> 0);
+ }
+}
+
+static forceinline void
+put_unaligned_le32(u32 v, u8 *p)
+{
+ if (UNALIGNED_ACCESS_IS_FAST) {
+ store_u32_unaligned(le32_bswap(v), p);
+ } else {
+ p[0] = (u8)(v >> 0);
+ p[1] = (u8)(v >> 8);
+ p[2] = (u8)(v >> 16);
+ p[3] = (u8)(v >> 24);
+ }
+}
+
+static forceinline void
+put_unaligned_be32(u32 v, u8 *p)
+{
+ if (UNALIGNED_ACCESS_IS_FAST) {
+ store_u32_unaligned(be32_bswap(v), p);
+ } else {
+ p[0] = (u8)(v >> 24);
+ p[1] = (u8)(v >> 16);
+ p[2] = (u8)(v >> 8);
+ p[3] = (u8)(v >> 0);
+ }
+}
+
+static forceinline void
+put_unaligned_le64(u64 v, u8 *p)
+{
+ if (UNALIGNED_ACCESS_IS_FAST) {
+ store_u64_unaligned(le64_bswap(v), p);
+ } else {
+ p[0] = (u8)(v >> 0);
+ p[1] = (u8)(v >> 8);
+ p[2] = (u8)(v >> 16);
+ p[3] = (u8)(v >> 24);
+ p[4] = (u8)(v >> 32);
+ p[5] = (u8)(v >> 40);
+ p[6] = (u8)(v >> 48);
+ p[7] = (u8)(v >> 56);
+ }
+}
+
+static forceinline void
+put_unaligned_leword(machine_word_t v, u8 *p)
+{
+ STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
+ if (WORDBITS == 32)
+ put_unaligned_le32(v, p);
+ else
+ put_unaligned_le64(v, p);
+}
+
+/* ========================================================================== */
+/* Bit manipulation functions */
+/* ========================================================================== */
+
+/*
+ * Bit Scan Reverse (BSR) - find the 0-based index (relative to the least
+ * significant end) of the *most* significant 1 bit in the input value. The
+ * input value must be nonzero!
+ */
+
+static forceinline unsigned
+bsr32(u32 v)
+{
+#if defined(__GNUC__) || __has_builtin(__builtin_clz)
+ return 31 - __builtin_clz(v);
+#elif defined(_MSC_VER)
+ unsigned long i;
+
+ _BitScanReverse(&i, v);
+ return i;
+#else
+ unsigned i = 0;
+
+ while ((v >>= 1) != 0)
+ i++;
+ return i;
+#endif
+}
+
+static forceinline unsigned
+bsr64(u64 v)
+{
+#if defined(__GNUC__) || __has_builtin(__builtin_clzll)
+ return 63 - __builtin_clzll(v);
+#elif defined(_MSC_VER) && defined(_WIN64)
+ unsigned long i;
+
+ _BitScanReverse64(&i, v);
+ return i;
+#else
+ unsigned i = 0;
+
+ while ((v >>= 1) != 0)
+ i++;
+ return i;
+#endif
+}
+
+static forceinline unsigned
+bsrw(machine_word_t v)
+{
+ STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
+ if (WORDBITS == 32)
+ return bsr32(v);
+ else
+ return bsr64(v);
+}
+
+/*
+ * Bit Scan Forward (BSF) - find the 0-based index (relative to the least
+ * significant end) of the *least* significant 1 bit in the input value. The
+ * input value must be nonzero!
+ */
+
+static forceinline unsigned
+bsf32(u32 v)
+{
+#if defined(__GNUC__) || __has_builtin(__builtin_ctz)
+ return __builtin_ctz(v);
+#elif defined(_MSC_VER)
+ unsigned long i;
+
+ _BitScanForward(&i, v);
+ return i;
+#else
+ unsigned i = 0;
+
+ for (; (v & 1) == 0; v >>= 1)
+ i++;
+ return i;
+#endif
+}
+
+static forceinline unsigned
+bsf64(u64 v)
+{
+#if defined(__GNUC__) || __has_builtin(__builtin_ctzll)
+ return __builtin_ctzll(v);
+#elif defined(_MSC_VER) && defined(_WIN64)
+ unsigned long i;
+
+ _BitScanForward64(&i, v);
+ return i;
+#else
+ unsigned i = 0;
+
+ for (; (v & 1) == 0; v >>= 1)
+ i++;
+ return i;
+#endif
+}
+
+static forceinline unsigned
+bsfw(machine_word_t v)
+{
+ STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
+ if (WORDBITS == 32)
+ return bsf32(v);
+ else
+ return bsf64(v);
+}
+
+/*
+ * rbit32(v): reverse the bits in a 32-bit integer. This doesn't have a
+ * fallback implementation; use '#ifdef rbit32' to check if this is available.
+ */
+#undef rbit32
+#if (defined(__GNUC__) || defined(__clang__)) && defined(ARCH_ARM32) && \
+ (__ARM_ARCH >= 7 || (__ARM_ARCH == 6 && defined(__ARM_ARCH_6T2__)))
+static forceinline u32
+rbit32(u32 v)
+{
+ __asm__("rbit %0, %1" : "=r" (v) : "r" (v));
+ return v;
+}
+#define rbit32 rbit32
+#elif (defined(__GNUC__) || defined(__clang__)) && defined(ARCH_ARM64)
+static forceinline u32
+rbit32(u32 v)
+{
+ __asm__("rbit %w0, %w1" : "=r" (v) : "r" (v));
+ return v;
+}
+#define rbit32 rbit32
+#endif
+
+#endif /* COMMON_DEFS_H */
diff --git a/tools/z64compress/src/enc/libdeflate/lib/adler32.c b/tools/z64compress/src/enc/libdeflate/lib/adler32.c
new file mode 100644
index 000000000..b743c6943
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/lib/adler32.c
@@ -0,0 +1,131 @@
+/*
+ * adler32.c - Adler-32 checksum algorithm
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "lib_common.h"
+#include "libdeflate.h"
+
+/* The Adler-32 divisor, or "base", value */
+#define DIVISOR 65521
+
+/*
+ * MAX_CHUNK_LEN is the most bytes that can be processed without the possibility
+ * of s2 overflowing when it is represented as an unsigned 32-bit integer. This
+ * value was computed using the following Python script:
+ *
+ * divisor = 65521
+ * count = 0
+ * s1 = divisor - 1
+ * s2 = divisor - 1
+ * while True:
+ * s1 += 0xFF
+ * s2 += s1
+ * if s2 > 0xFFFFFFFF:
+ * break
+ * count += 1
+ * print(count)
+ *
+ * Note that to get the correct worst-case value, we must assume that every byte
+ * has value 0xFF and that s1 and s2 started with the highest possible values
+ * modulo the divisor.
+ */
+#define MAX_CHUNK_LEN 5552
+
+static u32 MAYBE_UNUSED
+adler32_generic(u32 adler, const u8 *p, size_t len)
+{
+ u32 s1 = adler & 0xFFFF;
+ u32 s2 = adler >> 16;
+ const u8 * const end = p + len;
+
+ while (p != end) {
+ size_t chunk_len = MIN(end - p, MAX_CHUNK_LEN);
+ const u8 *chunk_end = p + chunk_len;
+ size_t num_unrolled_iterations = chunk_len / 4;
+
+ while (num_unrolled_iterations--) {
+ s1 += *p++;
+ s2 += s1;
+ s1 += *p++;
+ s2 += s1;
+ s1 += *p++;
+ s2 += s1;
+ s1 += *p++;
+ s2 += s1;
+ }
+ while (p != chunk_end) {
+ s1 += *p++;
+ s2 += s1;
+ }
+ s1 %= DIVISOR;
+ s2 %= DIVISOR;
+ }
+
+ return (s2 << 16) | s1;
+}
+
+/* Include architecture-specific implementation(s) if available. */
+#undef DEFAULT_IMPL
+#undef arch_select_adler32_func
+typedef u32 (*adler32_func_t)(u32 adler, const u8 *p, size_t len);
+#if defined(ARCH_ARM32) || defined(ARCH_ARM64)
+# include "arm/adler32_impl.h"
+#elif defined(ARCH_X86_32) || defined(ARCH_X86_64)
+# include "x86/adler32_impl.h"
+#endif
+
+#ifndef DEFAULT_IMPL
+# define DEFAULT_IMPL adler32_generic
+#endif
+
+#ifdef arch_select_adler32_func
+static u32 dispatch_adler32(u32 adler, const u8 *p, size_t len);
+
+static volatile adler32_func_t adler32_impl = dispatch_adler32;
+
+/* Choose the best implementation at runtime. */
+static u32 dispatch_adler32(u32 adler, const u8 *p, size_t len)
+{
+ adler32_func_t f = arch_select_adler32_func();
+
+ if (f == NULL)
+ f = DEFAULT_IMPL;
+
+ adler32_impl = f;
+ return f(adler, p, len);
+}
+#else
+/* The best implementation is statically known, so call it directly. */
+#define adler32_impl DEFAULT_IMPL
+#endif
+
+LIBDEFLATEAPI u32
+libdeflate_adler32(u32 adler, const void *buffer, size_t len)
+{
+ if (buffer == NULL) /* Return initial value. */
+ return 1;
+ return adler32_impl(adler, buffer, len);
+}
diff --git a/tools/z64compress/src/enc/libdeflate/lib/adler32_vec_template.h b/tools/z64compress/src/enc/libdeflate/lib/adler32_vec_template.h
new file mode 100644
index 000000000..98c086bbc
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/lib/adler32_vec_template.h
@@ -0,0 +1,123 @@
+/*
+ * adler32_vec_template.h - template for vectorized Adler-32 implementations
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * This file contains a template for vectorized Adler-32 implementations.
+ *
+ * The inner loop between reductions modulo 65521 of an unvectorized Adler-32
+ * implementation looks something like this:
+ *
+ * do {
+ * s1 += *p;
+ * s2 += s1;
+ * } while (++p != chunk_end);
+ *
+ * For vectorized calculation of s1, we only need to sum the input bytes. They
+ * can be accumulated into multiple counters which are eventually summed
+ * together.
+ *
+ * For vectorized calculation of s2, the basic idea is that for each iteration
+ * that processes N bytes, we can perform the following vectorizable
+ * calculation:
+ *
+ * s2 += N*byte_1 + (N-1)*byte_2 + (N-2)*byte_3 + ... + 1*byte_N
+ *
+ * Or, equivalently, we can sum the byte_1...byte_N for each iteration into N
+ * separate counters, then do the multiplications by N...1 just once at the end
+ * rather than once per iteration.
+ *
+ * Also, we must account for how previous bytes will affect s2 by doing the
+ * following at beginning of each iteration:
+ *
+ * s2 += s1 * N
+ *
+ * Furthermore, like s1, "s2" can actually be multiple counters which are
+ * eventually summed together.
+ */
+
+static u32 ATTRIBUTES MAYBE_UNUSED
+FUNCNAME(u32 adler, const u8 *p, size_t len)
+{
+ const size_t max_chunk_len =
+ MIN(MAX_CHUNK_LEN, IMPL_MAX_CHUNK_LEN) -
+ (MIN(MAX_CHUNK_LEN, IMPL_MAX_CHUNK_LEN) % IMPL_SEGMENT_LEN);
+ u32 s1 = adler & 0xFFFF;
+ u32 s2 = adler >> 16;
+ const u8 * const end = p + len;
+ const u8 *vend;
+
+ /* Process a byte at a time until the needed alignment is reached. */
+ if (p != end && (uintptr_t)p % IMPL_ALIGNMENT) {
+ do {
+ s1 += *p++;
+ s2 += s1;
+ } while (p != end && (uintptr_t)p % IMPL_ALIGNMENT);
+ s1 %= DIVISOR;
+ s2 %= DIVISOR;
+ }
+
+ /*
+ * Process "chunks" of bytes using vector instructions. Chunk lengths
+ * are limited to MAX_CHUNK_LEN, which guarantees that s1 and s2 never
+ * overflow before being reduced modulo DIVISOR. For vector processing,
+ * chunk lengths are also made evenly divisible by IMPL_SEGMENT_LEN and
+ * may be further limited to IMPL_MAX_CHUNK_LEN.
+ */
+ STATIC_ASSERT(IMPL_SEGMENT_LEN % IMPL_ALIGNMENT == 0);
+ vend = end - ((size_t)(end - p) % IMPL_SEGMENT_LEN);
+ while (p != vend) {
+ size_t chunk_len = MIN((size_t)(vend - p), max_chunk_len);
+
+ s2 += s1 * chunk_len;
+
+ FUNCNAME_CHUNK((const void *)p, (const void *)(p + chunk_len),
+ &s1, &s2);
+
+ p += chunk_len;
+ s1 %= DIVISOR;
+ s2 %= DIVISOR;
+ }
+
+ /* Process any remaining bytes. */
+ if (p != end) {
+ do {
+ s1 += *p++;
+ s2 += s1;
+ } while (p != end);
+ s1 %= DIVISOR;
+ s2 %= DIVISOR;
+ }
+
+ return (s2 << 16) | s1;
+}
+
+#undef FUNCNAME
+#undef FUNCNAME_CHUNK
+#undef ATTRIBUTES
+#undef IMPL_ALIGNMENT
+#undef IMPL_SEGMENT_LEN
+#undef IMPL_MAX_CHUNK_LEN
diff --git a/tools/z64compress/src/enc/libdeflate/lib/arm/adler32_impl.h b/tools/z64compress/src/enc/libdeflate/lib/arm/adler32_impl.h
new file mode 100644
index 000000000..4083b2ef3
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/lib/arm/adler32_impl.h
@@ -0,0 +1,272 @@
+/*
+ * arm/adler32_impl.h - ARM implementations of Adler-32 checksum algorithm
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef LIB_ARM_ADLER32_IMPL_H
+#define LIB_ARM_ADLER32_IMPL_H
+
+#include "cpu_features.h"
+
+/* Regular NEON implementation */
+#if HAVE_NEON_INTRIN && CPU_IS_LITTLE_ENDIAN()
+# define adler32_neon adler32_neon
+# define FUNCNAME adler32_neon
+# define FUNCNAME_CHUNK adler32_neon_chunk
+# define IMPL_ALIGNMENT 16
+# define IMPL_SEGMENT_LEN 64
+/* Prevent unsigned overflow of the 16-bit precision byte counters */
+# define IMPL_MAX_CHUNK_LEN (64 * (0xFFFF / 0xFF))
+# if HAVE_NEON_NATIVE
+# define ATTRIBUTES
+# else
+# ifdef ARCH_ARM32
+# define ATTRIBUTES _target_attribute("fpu=neon")
+# else
+# define ATTRIBUTES _target_attribute("+simd")
+# endif
+# endif
+# include <arm_neon.h>
+static forceinline ATTRIBUTES void
+adler32_neon_chunk(const uint8x16_t *p, const uint8x16_t * const end,
+ u32 *s1, u32 *s2)
+{
+ static const u16 _aligned_attribute(16) mults[64] = {
+ 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49,
+ 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33,
+ 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
+ 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
+ };
+ const uint16x8_t mults_a = vld1q_u16(&mults[0]);
+ const uint16x8_t mults_b = vld1q_u16(&mults[8]);
+ const uint16x8_t mults_c = vld1q_u16(&mults[16]);
+ const uint16x8_t mults_d = vld1q_u16(&mults[24]);
+ const uint16x8_t mults_e = vld1q_u16(&mults[32]);
+ const uint16x8_t mults_f = vld1q_u16(&mults[40]);
+ const uint16x8_t mults_g = vld1q_u16(&mults[48]);
+ const uint16x8_t mults_h = vld1q_u16(&mults[56]);
+
+ uint32x4_t v_s1 = vdupq_n_u32(0);
+ uint32x4_t v_s2 = vdupq_n_u32(0);
+ /*
+ * v_byte_sums_* contain the sum of the bytes at index i across all
+ * 64-byte segments, for each index 0..63.
+ */
+ uint16x8_t v_byte_sums_a = vdupq_n_u16(0);
+ uint16x8_t v_byte_sums_b = vdupq_n_u16(0);
+ uint16x8_t v_byte_sums_c = vdupq_n_u16(0);
+ uint16x8_t v_byte_sums_d = vdupq_n_u16(0);
+ uint16x8_t v_byte_sums_e = vdupq_n_u16(0);
+ uint16x8_t v_byte_sums_f = vdupq_n_u16(0);
+ uint16x8_t v_byte_sums_g = vdupq_n_u16(0);
+ uint16x8_t v_byte_sums_h = vdupq_n_u16(0);
+
+ do {
+ /* Load the next 64 bytes. */
+ const uint8x16_t bytes1 = *p++;
+ const uint8x16_t bytes2 = *p++;
+ const uint8x16_t bytes3 = *p++;
+ const uint8x16_t bytes4 = *p++;
+ uint16x8_t tmp;
+
+ /*
+ * Accumulate the previous s1 counters into the s2 counters.
+ * The needed multiplication by 64 is delayed to later.
+ */
+ v_s2 = vaddq_u32(v_s2, v_s1);
+
+ /*
+ * Add the 64 bytes to their corresponding v_byte_sums counters,
+ * while also accumulating the sums of each adjacent set of 4
+ * bytes into v_s1.
+ */
+ tmp = vpaddlq_u8(bytes1);
+ v_byte_sums_a = vaddw_u8(v_byte_sums_a, vget_low_u8(bytes1));
+ v_byte_sums_b = vaddw_u8(v_byte_sums_b, vget_high_u8(bytes1));
+ tmp = vpadalq_u8(tmp, bytes2);
+ v_byte_sums_c = vaddw_u8(v_byte_sums_c, vget_low_u8(bytes2));
+ v_byte_sums_d = vaddw_u8(v_byte_sums_d, vget_high_u8(bytes2));
+ tmp = vpadalq_u8(tmp, bytes3);
+ v_byte_sums_e = vaddw_u8(v_byte_sums_e, vget_low_u8(bytes3));
+ v_byte_sums_f = vaddw_u8(v_byte_sums_f, vget_high_u8(bytes3));
+ tmp = vpadalq_u8(tmp, bytes4);
+ v_byte_sums_g = vaddw_u8(v_byte_sums_g, vget_low_u8(bytes4));
+ v_byte_sums_h = vaddw_u8(v_byte_sums_h, vget_high_u8(bytes4));
+ v_s1 = vpadalq_u16(v_s1, tmp);
+
+ } while (p != end);
+
+ /* s2 = 64*s2 + (64*bytesum0 + 63*bytesum1 + ... + 1*bytesum63) */
+#ifdef ARCH_ARM32
+# define umlal2(a, b, c) vmlal_u16((a), vget_high_u16(b), vget_high_u16(c))
+#else
+# define umlal2 vmlal_high_u16
+#endif
+ v_s2 = vqshlq_n_u32(v_s2, 6);
+ v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_a), vget_low_u16(mults_a));
+ v_s2 = umlal2(v_s2, v_byte_sums_a, mults_a);
+ v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_b), vget_low_u16(mults_b));
+ v_s2 = umlal2(v_s2, v_byte_sums_b, mults_b);
+ v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_c), vget_low_u16(mults_c));
+ v_s2 = umlal2(v_s2, v_byte_sums_c, mults_c);
+ v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_d), vget_low_u16(mults_d));
+ v_s2 = umlal2(v_s2, v_byte_sums_d, mults_d);
+ v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_e), vget_low_u16(mults_e));
+ v_s2 = umlal2(v_s2, v_byte_sums_e, mults_e);
+ v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_f), vget_low_u16(mults_f));
+ v_s2 = umlal2(v_s2, v_byte_sums_f, mults_f);
+ v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_g), vget_low_u16(mults_g));
+ v_s2 = umlal2(v_s2, v_byte_sums_g, mults_g);
+ v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_h), vget_low_u16(mults_h));
+ v_s2 = umlal2(v_s2, v_byte_sums_h, mults_h);
+#undef umlal2
+
+ /* Horizontal sum to finish up */
+#ifdef ARCH_ARM32
+ *s1 += vgetq_lane_u32(v_s1, 0) + vgetq_lane_u32(v_s1, 1) +
+ vgetq_lane_u32(v_s1, 2) + vgetq_lane_u32(v_s1, 3);
+ *s2 += vgetq_lane_u32(v_s2, 0) + vgetq_lane_u32(v_s2, 1) +
+ vgetq_lane_u32(v_s2, 2) + vgetq_lane_u32(v_s2, 3);
+#else
+ *s1 += vaddvq_u32(v_s1);
+ *s2 += vaddvq_u32(v_s2);
+#endif
+}
+# include "../adler32_vec_template.h"
+#endif /* Regular NEON implementation */
+
+/* NEON+dotprod implementation */
+#if HAVE_DOTPROD_INTRIN && CPU_IS_LITTLE_ENDIAN()
+# define adler32_neon_dotprod adler32_neon_dotprod
+# define FUNCNAME adler32_neon_dotprod
+# define FUNCNAME_CHUNK adler32_neon_dotprod_chunk
+# define IMPL_ALIGNMENT 16
+# define IMPL_SEGMENT_LEN 64
+# define IMPL_MAX_CHUNK_LEN MAX_CHUNK_LEN
+# if HAVE_DOTPROD_NATIVE
+# define ATTRIBUTES
+# else
+# ifdef __clang__
+# define ATTRIBUTES _target_attribute("dotprod")
+ /*
+ * With gcc, arch=armv8.2-a is needed for dotprod intrinsics, unless the
+ * default target is armv8.3-a or later in which case it must be omitted.
+ * armv8.3-a or later can be detected by checking for __ARM_FEATURE_JCVT.
+ */
+# elif defined(__ARM_FEATURE_JCVT)
+# define ATTRIBUTES _target_attribute("+dotprod")
+# else
+# define ATTRIBUTES _target_attribute("arch=armv8.2-a+dotprod")
+# endif
+# endif
+# include <arm_neon.h>
+static forceinline ATTRIBUTES void
+adler32_neon_dotprod_chunk(const uint8x16_t *p, const uint8x16_t * const end,
+ u32 *s1, u32 *s2)
+{
+ static const u8 _aligned_attribute(16) mults[64] = {
+ 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49,
+ 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33,
+ 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
+ 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
+ };
+ const uint8x16_t mults_a = vld1q_u8(&mults[0]);
+ const uint8x16_t mults_b = vld1q_u8(&mults[16]);
+ const uint8x16_t mults_c = vld1q_u8(&mults[32]);
+ const uint8x16_t mults_d = vld1q_u8(&mults[48]);
+ const uint8x16_t ones = vdupq_n_u8(1);
+ uint32x4_t v_s1_a = vdupq_n_u32(0);
+ uint32x4_t v_s1_b = vdupq_n_u32(0);
+ uint32x4_t v_s1_c = vdupq_n_u32(0);
+ uint32x4_t v_s1_d = vdupq_n_u32(0);
+ uint32x4_t v_s2_a = vdupq_n_u32(0);
+ uint32x4_t v_s2_b = vdupq_n_u32(0);
+ uint32x4_t v_s2_c = vdupq_n_u32(0);
+ uint32x4_t v_s2_d = vdupq_n_u32(0);
+ uint32x4_t v_s1_sums_a = vdupq_n_u32(0);
+ uint32x4_t v_s1_sums_b = vdupq_n_u32(0);
+ uint32x4_t v_s1_sums_c = vdupq_n_u32(0);
+ uint32x4_t v_s1_sums_d = vdupq_n_u32(0);
+ uint32x4_t v_s1;
+ uint32x4_t v_s2;
+ uint32x4_t v_s1_sums;
+
+ do {
+ uint8x16_t bytes_a = *p++;
+ uint8x16_t bytes_b = *p++;
+ uint8x16_t bytes_c = *p++;
+ uint8x16_t bytes_d = *p++;
+
+ v_s1_sums_a = vaddq_u32(v_s1_sums_a, v_s1_a);
+ v_s1_a = vdotq_u32(v_s1_a, bytes_a, ones);
+ v_s2_a = vdotq_u32(v_s2_a, bytes_a, mults_a);
+
+ v_s1_sums_b = vaddq_u32(v_s1_sums_b, v_s1_b);
+ v_s1_b = vdotq_u32(v_s1_b, bytes_b, ones);
+ v_s2_b = vdotq_u32(v_s2_b, bytes_b, mults_b);
+
+ v_s1_sums_c = vaddq_u32(v_s1_sums_c, v_s1_c);
+ v_s1_c = vdotq_u32(v_s1_c, bytes_c, ones);
+ v_s2_c = vdotq_u32(v_s2_c, bytes_c, mults_c);
+
+ v_s1_sums_d = vaddq_u32(v_s1_sums_d, v_s1_d);
+ v_s1_d = vdotq_u32(v_s1_d, bytes_d, ones);
+ v_s2_d = vdotq_u32(v_s2_d, bytes_d, mults_d);
+ } while (p != end);
+
+ v_s1 = vaddq_u32(vaddq_u32(v_s1_a, v_s1_b), vaddq_u32(v_s1_c, v_s1_d));
+ v_s2 = vaddq_u32(vaddq_u32(v_s2_a, v_s2_b), vaddq_u32(v_s2_c, v_s2_d));
+ v_s1_sums = vaddq_u32(vaddq_u32(v_s1_sums_a, v_s1_sums_b),
+ vaddq_u32(v_s1_sums_c, v_s1_sums_d));
+ v_s2 = vaddq_u32(v_s2, vqshlq_n_u32(v_s1_sums, 6));
+
+ *s1 += vaddvq_u32(v_s1);
+ *s2 += vaddvq_u32(v_s2);
+}
+# include "../adler32_vec_template.h"
+#endif /* NEON+dotprod implementation */
+
+#if defined(adler32_neon_dotprod) && HAVE_DOTPROD_NATIVE
+#define DEFAULT_IMPL adler32_neon_dotprod
+#else
+static inline adler32_func_t
+arch_select_adler32_func(void)
+{
+ const u32 features MAYBE_UNUSED = get_arm_cpu_features();
+
+#ifdef adler32_neon_dotprod
+ if (HAVE_NEON(features) && HAVE_DOTPROD(features))
+ return adler32_neon_dotprod;
+#endif
+#ifdef adler32_neon
+ if (HAVE_NEON(features))
+ return adler32_neon;
+#endif
+ return NULL;
+}
+#define arch_select_adler32_func arch_select_adler32_func
+#endif
+
+#endif /* LIB_ARM_ADLER32_IMPL_H */
diff --git a/tools/z64compress/src/enc/libdeflate/lib/arm/cpu_features.c b/tools/z64compress/src/enc/libdeflate/lib/arm/cpu_features.c
new file mode 100644
index 000000000..ed710bc6f
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/lib/arm/cpu_features.c
@@ -0,0 +1,211 @@
+/*
+ * arm/cpu_features.c - feature detection for ARM CPUs
+ *
+ * Copyright 2018 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * ARM CPUs don't have a standard way for unprivileged programs to detect CPU
+ * features. But an OS-specific way can be used when available.
+ */
+
+#ifdef __APPLE__
+#undef _ANSI_SOURCE
+#define _DARWIN_C_SOURCE /* for sysctlbyname() */
+#endif
+
+#include "../cpu_features_common.h" /* must be included first */
+#include "cpu_features.h"
+
+#if HAVE_DYNAMIC_ARM_CPU_FEATURES
+
+#ifdef __linux__
+/*
+ * On Linux, arm32 and arm64 CPU features can be detected by reading the
+ * AT_HWCAP and AT_HWCAP2 values from /proc/self/auxv.
+ *
+ * Ideally we'd use the C library function getauxval(), but it's not guaranteed
+ * to be available: it was only added to glibc in 2.16, and in Android it was
+ * added to API level 18 for arm32 and level 21 for arm64.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+
+#define AT_HWCAP 16
+#define AT_HWCAP2 26
+
+static void scan_auxv(unsigned long *hwcap, unsigned long *hwcap2)
+{
+ int fd;
+ unsigned long auxbuf[32];
+ int filled = 0;
+ int i;
+
+ fd = open("/proc/self/auxv", O_RDONLY);
+ if (fd < 0)
+ return;
+
+ for (;;) {
+ do {
+ int ret = read(fd, &((char *)auxbuf)[filled],
+ sizeof(auxbuf) - filled);
+ if (ret <= 0) {
+ if (ret < 0 && errno == EINTR)
+ continue;
+ goto out;
+ }
+ filled += ret;
+ } while (filled < 2 * sizeof(long));
+
+ i = 0;
+ do {
+ unsigned long type = auxbuf[i];
+ unsigned long value = auxbuf[i + 1];
+
+ if (type == AT_HWCAP)
+ *hwcap = value;
+ else if (type == AT_HWCAP2)
+ *hwcap2 = value;
+ i += 2;
+ filled -= 2 * sizeof(long);
+ } while (filled >= 2 * sizeof(long));
+
+ memmove(auxbuf, &auxbuf[i], filled);
+ }
+out:
+ close(fd);
+}
+
+static u32 query_arm_cpu_features(void)
+{
+ u32 features = 0;
+ unsigned long hwcap = 0;
+ unsigned long hwcap2 = 0;
+
+ scan_auxv(&hwcap, &hwcap2);
+
+#ifdef ARCH_ARM32
+ STATIC_ASSERT(sizeof(long) == 4);
+ if (hwcap & (1 << 12)) /* HWCAP_NEON */
+ features |= ARM_CPU_FEATURE_NEON;
+ if (hwcap2 & (1 << 1)) /* HWCAP2_PMULL */
+ features |= ARM_CPU_FEATURE_PMULL;
+ if (hwcap2 & (1 << 4)) /* HWCAP2_CRC32 */
+ features |= ARM_CPU_FEATURE_CRC32;
+#else
+ STATIC_ASSERT(sizeof(long) == 8);
+ if (hwcap & (1 << 1)) /* HWCAP_ASIMD */
+ features |= ARM_CPU_FEATURE_NEON;
+ if (hwcap & (1 << 4)) /* HWCAP_PMULL */
+ features |= ARM_CPU_FEATURE_PMULL;
+ if (hwcap & (1 << 7)) /* HWCAP_CRC32 */
+ features |= ARM_CPU_FEATURE_CRC32;
+ if (hwcap & (1 << 17)) /* HWCAP_SHA3 */
+ features |= ARM_CPU_FEATURE_SHA3;
+ if (hwcap & (1 << 20)) /* HWCAP_ASIMDDP */
+ features |= ARM_CPU_FEATURE_DOTPROD;
+#endif
+ return features;
+}
+
+#elif defined(__APPLE__)
+/* On Apple platforms, arm64 CPU features can be detected via sysctlbyname(). */
+
+#include <sys/types.h>
+#include <sys/sysctl.h>
+
+static const struct {
+ const char *name;
+ u32 feature;
+} feature_sysctls[] = {
+ { "hw.optional.neon", ARM_CPU_FEATURE_NEON },
+ { "hw.optional.AdvSIMD", ARM_CPU_FEATURE_NEON },
+ { "hw.optional.arm.FEAT_PMULL", ARM_CPU_FEATURE_PMULL },
+ { "hw.optional.armv8_crc32", ARM_CPU_FEATURE_CRC32 },
+ { "hw.optional.armv8_2_sha3", ARM_CPU_FEATURE_SHA3 },
+ { "hw.optional.arm.FEAT_SHA3", ARM_CPU_FEATURE_SHA3 },
+ { "hw.optional.arm.FEAT_DotProd", ARM_CPU_FEATURE_DOTPROD },
+};
+
+static u32 query_arm_cpu_features(void)
+{
+ u32 features = 0;
+ size_t i;
+
+ for (i = 0; i < ARRAY_LEN(feature_sysctls); i++) {
+ const char *name = feature_sysctls[i].name;
+ u32 val = 0;
+ size_t valsize = sizeof(val);
+
+ if (sysctlbyname(name, &val, &valsize, NULL, 0) == 0 &&
+ valsize == sizeof(val) && val == 1)
+ features |= feature_sysctls[i].feature;
+ }
+ return features;
+}
+#elif defined(_WIN32)
+
+#include <windows.h>
+
+static u32 query_arm_cpu_features(void)
+{
+ u32 features = ARM_CPU_FEATURE_NEON;
+
+ if (IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE))
+ features |= ARM_CPU_FEATURE_PMULL;
+ if (IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE))
+ features |= ARM_CPU_FEATURE_CRC32;
+
+ /* FIXME: detect SHA3 and DOTPROD support too. */
+
+ return features;
+}
+#else
+#error "unhandled case"
+#endif
+
+static const struct cpu_feature arm_cpu_feature_table[] = {
+ {ARM_CPU_FEATURE_NEON, "neon"},
+ {ARM_CPU_FEATURE_PMULL, "pmull"},
+ {ARM_CPU_FEATURE_CRC32, "crc32"},
+ {ARM_CPU_FEATURE_SHA3, "sha3"},
+ {ARM_CPU_FEATURE_DOTPROD, "dotprod"},
+};
+
+volatile u32 libdeflate_arm_cpu_features = 0;
+
+void libdeflate_init_arm_cpu_features(void)
+{
+ u32 features = query_arm_cpu_features();
+
+ disable_cpu_features_for_testing(&features, arm_cpu_feature_table,
+ ARRAY_LEN(arm_cpu_feature_table));
+
+ libdeflate_arm_cpu_features = features | ARM_CPU_FEATURES_KNOWN;
+}
+
+#endif /* HAVE_DYNAMIC_ARM_CPU_FEATURES */
diff --git a/tools/z64compress/src/enc/libdeflate/lib/arm/cpu_features.h b/tools/z64compress/src/enc/libdeflate/lib/arm/cpu_features.h
new file mode 100644
index 000000000..548d31ea8
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/lib/arm/cpu_features.h
@@ -0,0 +1,223 @@
+/*
+ * arm/cpu_features.h - feature detection for ARM CPUs
+ *
+ * Copyright 2018 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef LIB_ARM_CPU_FEATURES_H
+#define LIB_ARM_CPU_FEATURES_H
+
+#include "../lib_common.h"
+
+#define HAVE_DYNAMIC_ARM_CPU_FEATURES 0
+
+#if defined(ARCH_ARM32) || defined(ARCH_ARM64)
+
+#if !defined(FREESTANDING) && \
+ (COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE || defined(_MSC_VER)) && \
+ (defined(__linux__) || \
+ (defined(__APPLE__) && defined(ARCH_ARM64)) || \
+ (defined(_WIN32) && defined(ARCH_ARM64)))
+# undef HAVE_DYNAMIC_ARM_CPU_FEATURES
+# define HAVE_DYNAMIC_ARM_CPU_FEATURES 1
+#endif
+
+#define ARM_CPU_FEATURE_NEON 0x00000001
+#define ARM_CPU_FEATURE_PMULL 0x00000002
+#define ARM_CPU_FEATURE_CRC32 0x00000004
+#define ARM_CPU_FEATURE_SHA3 0x00000008
+#define ARM_CPU_FEATURE_DOTPROD 0x00000010
+
+#define HAVE_NEON(features) (HAVE_NEON_NATIVE || ((features) & ARM_CPU_FEATURE_NEON))
+#define HAVE_PMULL(features) (HAVE_PMULL_NATIVE || ((features) & ARM_CPU_FEATURE_PMULL))
+#define HAVE_CRC32(features) (HAVE_CRC32_NATIVE || ((features) & ARM_CPU_FEATURE_CRC32))
+#define HAVE_SHA3(features) (HAVE_SHA3_NATIVE || ((features) & ARM_CPU_FEATURE_SHA3))
+#define HAVE_DOTPROD(features) (HAVE_DOTPROD_NATIVE || ((features) & ARM_CPU_FEATURE_DOTPROD))
+
+#if HAVE_DYNAMIC_ARM_CPU_FEATURES
+#define ARM_CPU_FEATURES_KNOWN 0x80000000
+extern volatile u32 libdeflate_arm_cpu_features;
+
+void libdeflate_init_arm_cpu_features(void);
+
+static inline u32 get_arm_cpu_features(void)
+{
+ if (libdeflate_arm_cpu_features == 0)
+ libdeflate_init_arm_cpu_features();
+ return libdeflate_arm_cpu_features;
+}
+#else /* HAVE_DYNAMIC_ARM_CPU_FEATURES */
+static inline u32 get_arm_cpu_features(void) { return 0; }
+#endif /* !HAVE_DYNAMIC_ARM_CPU_FEATURES */
+
+/* NEON */
+#if defined(__ARM_NEON) || defined(ARCH_ARM64)
+# define HAVE_NEON_NATIVE 1
+#else
+# define HAVE_NEON_NATIVE 0
+#endif
+/*
+ * With both gcc and clang, NEON intrinsics require that the main target has
+ * NEON enabled already. Exception: with gcc 6.1 and later (r230411 for arm32,
+ * r226563 for arm64), hardware floating point support is sufficient.
+ */
+#if HAVE_NEON_NATIVE || \
+ (HAVE_DYNAMIC_ARM_CPU_FEATURES && GCC_PREREQ(6, 1) && defined(__ARM_FP))
+# define HAVE_NEON_INTRIN 1
+#else
+# define HAVE_NEON_INTRIN 0
+#endif
+
+/* PMULL */
+#ifdef __ARM_FEATURE_CRYPTO
+# define HAVE_PMULL_NATIVE 1
+#else
+# define HAVE_PMULL_NATIVE 0
+#endif
+#if HAVE_PMULL_NATIVE || \
+ (HAVE_DYNAMIC_ARM_CPU_FEATURES && \
+ (GCC_PREREQ(6, 1) || __has_builtin(__builtin_neon_vmull_p64) || \
+ defined(_MSC_VER)) && \
+ /*
+ * On arm32 with clang, the crypto intrinsics (which include pmull)
+ * are not defined, even when using -mfpu=crypto-neon-fp-armv8,
+ * because clang's <arm_neon.h> puts their definitions behind
+ * __aarch64__.
+ */ \
+ !(defined(ARCH_ARM32) && defined(__clang__)))
+# define HAVE_PMULL_INTRIN CPU_IS_LITTLE_ENDIAN() /* untested on big endian */
+ /* Work around MSVC's vmull_p64() taking poly64x1_t instead of poly64_t */
+# ifdef _MSC_VER
+# define compat_vmull_p64(a, b) vmull_p64(vcreate_p64(a), vcreate_p64(b))
+# else
+# define compat_vmull_p64(a, b) vmull_p64((a), (b))
+# endif
+#else
+# define HAVE_PMULL_INTRIN 0
+#endif
+
+/* CRC32 */
+#ifdef __ARM_FEATURE_CRC32
+# define HAVE_CRC32_NATIVE 1
+#else
+# define HAVE_CRC32_NATIVE 0
+#endif
+/*
+ * Support for ARM CRC32 intrinsics when CRC32 instructions are not enabled in
+ * the main target has been affected by two gcc bugs, which we must avoid by
+ * only allowing gcc versions that have the corresponding fixes. First, gcc
+ * commit 943766d37ae4 ("[arm] Fix use of CRC32 intrinsics with Armv8-a and
+ * hard-float"), i.e. gcc 8.4+, 9.3+, 10.1+, or 11+, is needed. Second, gcc
+ * commit c1cdabe3aab8 ("arm: reorder assembler architecture directives
+ * [PR101723]"), i.e. gcc 9.5+, 10.4+, 11.3+, or 12+, is needed when binutils is
+ * 2.34 or later, due to https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104439.
+ * We use the second set of prerequisites, as they are stricter and we have no
+ * way to detect the binutils version directly from a C source file.
+ */
+#if HAVE_CRC32_NATIVE || \
+ (HAVE_DYNAMIC_ARM_CPU_FEATURES && \
+ (__has_builtin(__builtin_arm_crc32b) || \
+ GCC_PREREQ(11, 3) || \
+ (GCC_PREREQ(10, 4) && !GCC_PREREQ(11, 0)) || \
+ (GCC_PREREQ(9, 5) && !GCC_PREREQ(10, 0)) || \
+ defined(_MSC_VER)))
+# define HAVE_CRC32_INTRIN 1
+#else
+# define HAVE_CRC32_INTRIN 0
+#endif
+
+/* SHA3 (needed for the eor3 instruction) */
+#if defined(ARCH_ARM64) && !defined(_MSC_VER)
+# ifdef __ARM_FEATURE_SHA3
+# define HAVE_SHA3_NATIVE 1
+# else
+# define HAVE_SHA3_NATIVE 0
+# endif
+# define HAVE_SHA3_TARGET (HAVE_DYNAMIC_ARM_CPU_FEATURES && \
+ (GCC_PREREQ(8, 1) /* r256478 */ || \
+ CLANG_PREREQ(7, 0, 10010463) /* r338010 */))
+# define HAVE_SHA3_INTRIN (HAVE_NEON_INTRIN && \
+ (HAVE_SHA3_NATIVE || HAVE_SHA3_TARGET) && \
+ (GCC_PREREQ(9, 1) /* r268049 */ || \
+ __has_builtin(__builtin_neon_veor3q_v)))
+#else
+# define HAVE_SHA3_NATIVE 0
+# define HAVE_SHA3_TARGET 0
+# define HAVE_SHA3_INTRIN 0
+#endif
+
+/* dotprod */
+#ifdef ARCH_ARM64
+# ifdef __ARM_FEATURE_DOTPROD
+# define HAVE_DOTPROD_NATIVE 1
+# else
+# define HAVE_DOTPROD_NATIVE 0
+# endif
+# if HAVE_DOTPROD_NATIVE || \
+ (HAVE_DYNAMIC_ARM_CPU_FEATURES && \
+ (GCC_PREREQ(8, 1) || __has_builtin(__builtin_neon_vdotq_v) || \
+ defined(_MSC_VER)))
+# define HAVE_DOTPROD_INTRIN 1
+# else
+# define HAVE_DOTPROD_INTRIN 0
+# endif
+#else
+# define HAVE_DOTPROD_NATIVE 0
+# define HAVE_DOTPROD_INTRIN 0
+#endif
+
+/*
+ * Work around bugs in arm_acle.h and arm_neon.h where sometimes intrinsics are
+ * only defined when the corresponding __ARM_FEATURE_* macro is defined. The
+ * intrinsics actually work in target attribute functions too if they are
+ * defined, though, so work around this by temporarily defining the
+ * corresponding __ARM_FEATURE_* macros while including the headers.
+ */
+#if HAVE_CRC32_INTRIN && !HAVE_CRC32_NATIVE && \
+ (defined(__clang__) || defined(ARCH_ARM32))
+# define __ARM_FEATURE_CRC32 1
+#endif
+#if HAVE_SHA3_INTRIN && !HAVE_SHA3_NATIVE && defined(__clang__)
+# define __ARM_FEATURE_SHA3 1
+#endif
+#if HAVE_DOTPROD_INTRIN && !HAVE_DOTPROD_NATIVE && defined(__clang__)
+# define __ARM_FEATURE_DOTPROD 1
+#endif
+#if HAVE_CRC32_INTRIN && !HAVE_CRC32_NATIVE && \
+ (defined(__clang__) || defined(ARCH_ARM32))
+# include <arm_acle.h>
+# undef __ARM_FEATURE_CRC32
+#endif
+#if HAVE_SHA3_INTRIN && !HAVE_SHA3_NATIVE && defined(__clang__)
+# include <arm_neon.h>
+# undef __ARM_FEATURE_SHA3
+#endif
+#if HAVE_DOTPROD_INTRIN && !HAVE_DOTPROD_NATIVE && defined(__clang__)
+# include <arm_neon.h>
+# undef __ARM_FEATURE_DOTPROD
+#endif
+
+#endif /* ARCH_ARM32 || ARCH_ARM64 */
+
+#endif /* LIB_ARM_CPU_FEATURES_H */
diff --git a/tools/z64compress/src/enc/libdeflate/lib/arm/crc32_impl.h b/tools/z64compress/src/enc/libdeflate/lib/arm/crc32_impl.h
new file mode 100644
index 000000000..e426a63d6
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/lib/arm/crc32_impl.h
@@ -0,0 +1,665 @@
+/*
+ * arm/crc32_impl.h - ARM implementations of the gzip CRC-32 algorithm
+ *
+ * Copyright 2022 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef LIB_ARM_CRC32_IMPL_H
+#define LIB_ARM_CRC32_IMPL_H
+
+#include "cpu_features.h"
+
+/*
+ * crc32_arm_crc() - implementation using crc32 instructions (only)
+ *
+ * In general this implementation is straightforward. However, naive use of the
+ * crc32 instructions is serial: one of the two inputs to each crc32 instruction
+ * is the output of the previous one. To take advantage of CPUs that can
+ * execute multiple crc32 instructions in parallel, when possible we interleave
+ * the checksumming of several adjacent chunks, then combine their CRCs.
+ *
+ * However, without pmull, combining CRCs is fairly slow. So in this pmull-less
+ * version, we only use a large chunk length, and thus we only do chunked
+ * processing if there is a lot of data to checksum. This also means that a
+ * variable chunk length wouldn't help much, so we just support a fixed length.
+ */
+#if HAVE_CRC32_INTRIN
+# if HAVE_CRC32_NATIVE
+# define ATTRIBUTES
+# else
+# ifdef ARCH_ARM32
+# ifdef __clang__
+# define ATTRIBUTES _target_attribute("armv8-a,crc")
+# else
+# define ATTRIBUTES _target_attribute("arch=armv8-a+crc")
+# endif
+# else
+# ifdef __clang__
+# define ATTRIBUTES _target_attribute("crc")
+# else
+# define ATTRIBUTES _target_attribute("+crc")
+# endif
+# endif
+# endif
+
+#ifndef _MSC_VER
+# include <arm_acle.h>
+#endif
+
+/*
+ * Combine the CRCs for 4 adjacent chunks of length L = CRC32_FIXED_CHUNK_LEN
+ * bytes each by computing:
+ *
+ * [ crc0*x^(3*8*L) + crc1*x^(2*8*L) + crc2*x^(1*8*L) + crc3 ] mod G(x)
+ *
+ * This has been optimized in several ways:
+ *
+ * - The needed multipliers (x to some power, reduced mod G(x)) were
+ * precomputed.
+ *
+ * - The 3 multiplications are interleaved.
+ *
+ * - The reduction mod G(x) is delayed to the end and done using __crc32d.
+ * Note that the use of __crc32d introduces an extra factor of x^32. To
+ * cancel that out along with the extra factor of x^1 that gets introduced
+ * because of how the 63-bit products are aligned in their 64-bit integers,
+ * the multipliers are actually x^(j*8*L - 33) instead of x^(j*8*L).
+ */
+static forceinline ATTRIBUTES u32
+combine_crcs_slow(u32 crc0, u32 crc1, u32 crc2, u32 crc3)
+{
+ u64 res0 = 0, res1 = 0, res2 = 0;
+ int i;
+
+ /* Multiply crc{0,1,2} by CRC32_FIXED_CHUNK_MULT_{3,2,1}. */
+ for (i = 0; i < 32; i++) {
+ if (CRC32_FIXED_CHUNK_MULT_3 & (1U << i))
+ res0 ^= (u64)crc0 << i;
+ if (CRC32_FIXED_CHUNK_MULT_2 & (1U << i))
+ res1 ^= (u64)crc1 << i;
+ if (CRC32_FIXED_CHUNK_MULT_1 & (1U << i))
+ res2 ^= (u64)crc2 << i;
+ }
+ /* Add the different parts and reduce mod G(x). */
+ return __crc32d(0, res0 ^ res1 ^ res2) ^ crc3;
+}
+
+#define crc32_arm_crc crc32_arm_crc
+static u32 ATTRIBUTES MAYBE_UNUSED
+crc32_arm_crc(u32 crc, const u8 *p, size_t len)
+{
+ if (len >= 64) {
+ const size_t align = -(uintptr_t)p & 7;
+
+ /* Align p to the next 8-byte boundary. */
+ if (align) {
+ if (align & 1)
+ crc = __crc32b(crc, *p++);
+ if (align & 2) {
+ crc = __crc32h(crc, le16_bswap(*(u16 *)p));
+ p += 2;
+ }
+ if (align & 4) {
+ crc = __crc32w(crc, le32_bswap(*(u32 *)p));
+ p += 4;
+ }
+ len -= align;
+ }
+ /*
+ * Interleave the processing of multiple adjacent data chunks to
+ * take advantage of instruction-level parallelism.
+ *
+ * Some CPUs don't prefetch the data if it's being fetched in
+ * multiple interleaved streams, so do explicit prefetching.
+ */
+ while (len >= CRC32_NUM_CHUNKS * CRC32_FIXED_CHUNK_LEN) {
+ const u64 *wp0 = (const u64 *)p;
+ const u64 * const wp0_end =
+ (const u64 *)(p + CRC32_FIXED_CHUNK_LEN);
+ u32 crc1 = 0, crc2 = 0, crc3 = 0;
+
+ STATIC_ASSERT(CRC32_NUM_CHUNKS == 4);
+ STATIC_ASSERT(CRC32_FIXED_CHUNK_LEN % (4 * 8) == 0);
+ do {
+ prefetchr(&wp0[64 + 0*CRC32_FIXED_CHUNK_LEN/8]);
+ prefetchr(&wp0[64 + 1*CRC32_FIXED_CHUNK_LEN/8]);
+ prefetchr(&wp0[64 + 2*CRC32_FIXED_CHUNK_LEN/8]);
+ prefetchr(&wp0[64 + 3*CRC32_FIXED_CHUNK_LEN/8]);
+ crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_FIXED_CHUNK_LEN/8]));
+ crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_FIXED_CHUNK_LEN/8]));
+ crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_FIXED_CHUNK_LEN/8]));
+ crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_FIXED_CHUNK_LEN/8]));
+ wp0++;
+ crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_FIXED_CHUNK_LEN/8]));
+ crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_FIXED_CHUNK_LEN/8]));
+ crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_FIXED_CHUNK_LEN/8]));
+ crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_FIXED_CHUNK_LEN/8]));
+ wp0++;
+ crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_FIXED_CHUNK_LEN/8]));
+ crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_FIXED_CHUNK_LEN/8]));
+ crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_FIXED_CHUNK_LEN/8]));
+ crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_FIXED_CHUNK_LEN/8]));
+ wp0++;
+ crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_FIXED_CHUNK_LEN/8]));
+ crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_FIXED_CHUNK_LEN/8]));
+ crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_FIXED_CHUNK_LEN/8]));
+ crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_FIXED_CHUNK_LEN/8]));
+ wp0++;
+ } while (wp0 != wp0_end);
+ crc = combine_crcs_slow(crc, crc1, crc2, crc3);
+ p += CRC32_NUM_CHUNKS * CRC32_FIXED_CHUNK_LEN;
+ len -= CRC32_NUM_CHUNKS * CRC32_FIXED_CHUNK_LEN;
+ }
+ /*
+ * Due to the large fixed chunk length used above, there might
+ * still be a lot of data left. So use a 64-byte loop here,
+ * instead of a loop that is less unrolled.
+ */
+ while (len >= 64) {
+ crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 0)));
+ crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 8)));
+ crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 16)));
+ crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 24)));
+ crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 32)));
+ crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 40)));
+ crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 48)));
+ crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 56)));
+ p += 64;
+ len -= 64;
+ }
+ }
+ if (len & 32) {
+ crc = __crc32d(crc, get_unaligned_le64(p + 0));
+ crc = __crc32d(crc, get_unaligned_le64(p + 8));
+ crc = __crc32d(crc, get_unaligned_le64(p + 16));
+ crc = __crc32d(crc, get_unaligned_le64(p + 24));
+ p += 32;
+ }
+ if (len & 16) {
+ crc = __crc32d(crc, get_unaligned_le64(p + 0));
+ crc = __crc32d(crc, get_unaligned_le64(p + 8));
+ p += 16;
+ }
+ if (len & 8) {
+ crc = __crc32d(crc, get_unaligned_le64(p));
+ p += 8;
+ }
+ if (len & 4) {
+ crc = __crc32w(crc, get_unaligned_le32(p));
+ p += 4;
+ }
+ if (len & 2) {
+ crc = __crc32h(crc, get_unaligned_le16(p));
+ p += 2;
+ }
+ if (len & 1)
+ crc = __crc32b(crc, *p);
+ return crc;
+}
+#undef ATTRIBUTES
+#endif /* crc32_arm_crc() */
+
+/*
+ * crc32_arm_crc_pmullcombine() - implementation using crc32 instructions, plus
+ * pmull instructions for CRC combining
+ *
+ * This is similar to crc32_arm_crc(), but it enables the use of pmull
+ * (carryless multiplication) instructions for the steps where the CRCs of
+ * adjacent data chunks are combined. As this greatly speeds up CRC
+ * combination, this implementation also differs from crc32_arm_crc() in that it
+ * uses a variable chunk length which can get fairly small. The precomputed
+ * multipliers needed for the selected chunk length are loaded from a table.
+ *
+ * Note that pmull is used here only for combining the CRCs of separately
+ * checksummed chunks, not for folding the data itself. See crc32_arm_pmull*()
+ * for implementations that use pmull for folding the data itself.
+ */
+#if HAVE_CRC32_INTRIN && HAVE_PMULL_INTRIN
+# if HAVE_CRC32_NATIVE && HAVE_PMULL_NATIVE
+# define ATTRIBUTES
+# else
+# ifdef ARCH_ARM32
+# define ATTRIBUTES _target_attribute("arch=armv8-a+crc,fpu=crypto-neon-fp-armv8")
+# else
+# ifdef __clang__
+# define ATTRIBUTES _target_attribute("crc,crypto")
+# else
+# define ATTRIBUTES _target_attribute("+crc,+crypto")
+# endif
+# endif
+# endif
+
+#ifndef _MSC_VER
+# include <arm_acle.h>
+#endif
+#include <arm_neon.h>
+
+/* Do carryless multiplication of two 32-bit values. */
+static forceinline ATTRIBUTES u64
+clmul_u32(u32 a, u32 b)
+{
+ uint64x2_t res = vreinterpretq_u64_p128(
+ compat_vmull_p64((poly64_t)a, (poly64_t)b));
+
+ return vgetq_lane_u64(res, 0);
+}
+
+/*
+ * Like combine_crcs_slow(), but uses vmull_p64 to do the multiplications more
+ * quickly, and supports a variable chunk length. The chunk length is
+ * 'i * CRC32_MIN_VARIABLE_CHUNK_LEN'
+ * where 1 <= i < ARRAY_LEN(crc32_mults_for_chunklen).
+ */
+static forceinline ATTRIBUTES u32
+combine_crcs_fast(u32 crc0, u32 crc1, u32 crc2, u32 crc3, size_t i)
+{
+ u64 res0 = clmul_u32(crc0, crc32_mults_for_chunklen[i][0]);
+ u64 res1 = clmul_u32(crc1, crc32_mults_for_chunklen[i][1]);
+ u64 res2 = clmul_u32(crc2, crc32_mults_for_chunklen[i][2]);
+
+ return __crc32d(0, res0 ^ res1 ^ res2) ^ crc3;
+}
+
+#define crc32_arm_crc_pmullcombine crc32_arm_crc_pmullcombine
+static u32 ATTRIBUTES MAYBE_UNUSED
+crc32_arm_crc_pmullcombine(u32 crc, const u8 *p, size_t len)
+{
+ const size_t align = -(uintptr_t)p & 7;
+
+ if (len >= align + CRC32_NUM_CHUNKS * CRC32_MIN_VARIABLE_CHUNK_LEN) {
+ /* Align p to the next 8-byte boundary. */
+ if (align) {
+ if (align & 1)
+ crc = __crc32b(crc, *p++);
+ if (align & 2) {
+ crc = __crc32h(crc, le16_bswap(*(u16 *)p));
+ p += 2;
+ }
+ if (align & 4) {
+ crc = __crc32w(crc, le32_bswap(*(u32 *)p));
+ p += 4;
+ }
+ len -= align;
+ }
+ /*
+ * Handle CRC32_MAX_VARIABLE_CHUNK_LEN specially, so that better
+ * code is generated for it.
+ */
+ while (len >= CRC32_NUM_CHUNKS * CRC32_MAX_VARIABLE_CHUNK_LEN) {
+ const u64 *wp0 = (const u64 *)p;
+ const u64 * const wp0_end =
+ (const u64 *)(p + CRC32_MAX_VARIABLE_CHUNK_LEN);
+ u32 crc1 = 0, crc2 = 0, crc3 = 0;
+
+ STATIC_ASSERT(CRC32_NUM_CHUNKS == 4);
+ STATIC_ASSERT(CRC32_MAX_VARIABLE_CHUNK_LEN % (4 * 8) == 0);
+ do {
+ prefetchr(&wp0[64 + 0*CRC32_MAX_VARIABLE_CHUNK_LEN/8]);
+ prefetchr(&wp0[64 + 1*CRC32_MAX_VARIABLE_CHUNK_LEN/8]);
+ prefetchr(&wp0[64 + 2*CRC32_MAX_VARIABLE_CHUNK_LEN/8]);
+ prefetchr(&wp0[64 + 3*CRC32_MAX_VARIABLE_CHUNK_LEN/8]);
+ crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_MAX_VARIABLE_CHUNK_LEN/8]));
+ crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_MAX_VARIABLE_CHUNK_LEN/8]));
+ crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_MAX_VARIABLE_CHUNK_LEN/8]));
+ crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_MAX_VARIABLE_CHUNK_LEN/8]));
+ wp0++;
+ crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_MAX_VARIABLE_CHUNK_LEN/8]));
+ crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_MAX_VARIABLE_CHUNK_LEN/8]));
+ crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_MAX_VARIABLE_CHUNK_LEN/8]));
+ crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_MAX_VARIABLE_CHUNK_LEN/8]));
+ wp0++;
+ crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_MAX_VARIABLE_CHUNK_LEN/8]));
+ crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_MAX_VARIABLE_CHUNK_LEN/8]));
+ crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_MAX_VARIABLE_CHUNK_LEN/8]));
+ crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_MAX_VARIABLE_CHUNK_LEN/8]));
+ wp0++;
+ crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_MAX_VARIABLE_CHUNK_LEN/8]));
+ crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_MAX_VARIABLE_CHUNK_LEN/8]));
+ crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_MAX_VARIABLE_CHUNK_LEN/8]));
+ crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_MAX_VARIABLE_CHUNK_LEN/8]));
+ wp0++;
+ } while (wp0 != wp0_end);
+ crc = combine_crcs_fast(crc, crc1, crc2, crc3,
+ ARRAY_LEN(crc32_mults_for_chunklen) - 1);
+ p += CRC32_NUM_CHUNKS * CRC32_MAX_VARIABLE_CHUNK_LEN;
+ len -= CRC32_NUM_CHUNKS * CRC32_MAX_VARIABLE_CHUNK_LEN;
+ }
+ /* Handle up to one variable-length chunk. */
+ if (len >= CRC32_NUM_CHUNKS * CRC32_MIN_VARIABLE_CHUNK_LEN) {
+ const size_t i = len / (CRC32_NUM_CHUNKS *
+ CRC32_MIN_VARIABLE_CHUNK_LEN);
+ const size_t chunk_len =
+ i * CRC32_MIN_VARIABLE_CHUNK_LEN;
+ const u64 *wp0 = (const u64 *)(p + 0*chunk_len);
+ const u64 *wp1 = (const u64 *)(p + 1*chunk_len);
+ const u64 *wp2 = (const u64 *)(p + 2*chunk_len);
+ const u64 *wp3 = (const u64 *)(p + 3*chunk_len);
+ const u64 * const wp0_end = wp1;
+ u32 crc1 = 0, crc2 = 0, crc3 = 0;
+
+ STATIC_ASSERT(CRC32_NUM_CHUNKS == 4);
+ STATIC_ASSERT(CRC32_MIN_VARIABLE_CHUNK_LEN % (4 * 8) == 0);
+ do {
+ prefetchr(wp0 + 64);
+ prefetchr(wp1 + 64);
+ prefetchr(wp2 + 64);
+ prefetchr(wp3 + 64);
+ crc = __crc32d(crc, le64_bswap(*wp0++));
+ crc1 = __crc32d(crc1, le64_bswap(*wp1++));
+ crc2 = __crc32d(crc2, le64_bswap(*wp2++));
+ crc3 = __crc32d(crc3, le64_bswap(*wp3++));
+ crc = __crc32d(crc, le64_bswap(*wp0++));
+ crc1 = __crc32d(crc1, le64_bswap(*wp1++));
+ crc2 = __crc32d(crc2, le64_bswap(*wp2++));
+ crc3 = __crc32d(crc3, le64_bswap(*wp3++));
+ crc = __crc32d(crc, le64_bswap(*wp0++));
+ crc1 = __crc32d(crc1, le64_bswap(*wp1++));
+ crc2 = __crc32d(crc2, le64_bswap(*wp2++));
+ crc3 = __crc32d(crc3, le64_bswap(*wp3++));
+ crc = __crc32d(crc, le64_bswap(*wp0++));
+ crc1 = __crc32d(crc1, le64_bswap(*wp1++));
+ crc2 = __crc32d(crc2, le64_bswap(*wp2++));
+ crc3 = __crc32d(crc3, le64_bswap(*wp3++));
+ } while (wp0 != wp0_end);
+ crc = combine_crcs_fast(crc, crc1, crc2, crc3, i);
+ p += CRC32_NUM_CHUNKS * chunk_len;
+ len -= CRC32_NUM_CHUNKS * chunk_len;
+ }
+
+ while (len >= 32) {
+ crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 0)));
+ crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 8)));
+ crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 16)));
+ crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 24)));
+ p += 32;
+ len -= 32;
+ }
+ } else {
+ while (len >= 32) {
+ crc = __crc32d(crc, get_unaligned_le64(p + 0));
+ crc = __crc32d(crc, get_unaligned_le64(p + 8));
+ crc = __crc32d(crc, get_unaligned_le64(p + 16));
+ crc = __crc32d(crc, get_unaligned_le64(p + 24));
+ p += 32;
+ len -= 32;
+ }
+ }
+ if (len & 16) {
+ crc = __crc32d(crc, get_unaligned_le64(p + 0));
+ crc = __crc32d(crc, get_unaligned_le64(p + 8));
+ p += 16;
+ }
+ if (len & 8) {
+ crc = __crc32d(crc, get_unaligned_le64(p));
+ p += 8;
+ }
+ if (len & 4) {
+ crc = __crc32w(crc, get_unaligned_le32(p));
+ p += 4;
+ }
+ if (len & 2) {
+ crc = __crc32h(crc, get_unaligned_le16(p));
+ p += 2;
+ }
+ if (len & 1)
+ crc = __crc32b(crc, *p);
+ return crc;
+}
+#undef ATTRIBUTES
+#endif /* crc32_arm_crc_pmullcombine() */
+
+/*
+ * crc32_arm_pmullx4() - implementation using "folding" with pmull instructions
+ *
+ * This implementation is intended for CPUs that support pmull instructions but
+ * not crc32 instructions.
+ */
+#if HAVE_PMULL_INTRIN
+# define crc32_arm_pmullx4 crc32_arm_pmullx4
+# define SUFFIX _pmullx4
+# if HAVE_PMULL_NATIVE
+# define ATTRIBUTES
+# else
+# ifdef ARCH_ARM32
+# define ATTRIBUTES _target_attribute("fpu=crypto-neon-fp-armv8")
+# else
+# ifdef __clang__
+# define ATTRIBUTES _target_attribute("crypto")
+# else
+# define ATTRIBUTES _target_attribute("+crypto")
+# endif
+# endif
+# endif
+# define ENABLE_EOR3 0
+# include "crc32_pmull_helpers.h"
+
+static u32 ATTRIBUTES MAYBE_UNUSED
+crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len)
+{
+ static const u64 _aligned_attribute(16) mults[3][2] = {
+ CRC32_1VECS_MULTS,
+ CRC32_4VECS_MULTS,
+ CRC32_2VECS_MULTS,
+ };
+ static const u64 _aligned_attribute(16) final_mults[3][2] = {
+ { CRC32_FINAL_MULT, 0 },
+ { CRC32_BARRETT_CONSTANT_1, 0 },
+ { CRC32_BARRETT_CONSTANT_2, 0 },
+ };
+ const uint8x16_t zeroes = vdupq_n_u8(0);
+ const uint8x16_t mask32 = vreinterpretq_u8_u64(vdupq_n_u64(0xFFFFFFFF));
+ const poly64x2_t multipliers_1 = load_multipliers(mults[0]);
+ uint8x16_t v0, v1, v2, v3;
+
+ if (len < 64 + 15) {
+ if (len < 16)
+ return crc32_slice1(crc, p, len);
+ v0 = veorq_u8(vld1q_u8(p), u32_to_bytevec(crc));
+ p += 16;
+ len -= 16;
+ while (len >= 16) {
+ v0 = fold_vec(v0, vld1q_u8(p), multipliers_1);
+ p += 16;
+ len -= 16;
+ }
+ } else {
+ const poly64x2_t multipliers_4 = load_multipliers(mults[1]);
+ const poly64x2_t multipliers_2 = load_multipliers(mults[2]);
+ const size_t align = -(uintptr_t)p & 15;
+ const uint8x16_t *vp;
+
+ v0 = veorq_u8(vld1q_u8(p), u32_to_bytevec(crc));
+ p += 16;
+ /* Align p to the next 16-byte boundary. */
+ if (align) {
+ v0 = fold_partial_vec(v0, p, align, multipliers_1);
+ p += align;
+ len -= align;
+ }
+ vp = (const uint8x16_t *)p;
+ v1 = *vp++;
+ v2 = *vp++;
+ v3 = *vp++;
+ while (len >= 64 + 64) {
+ v0 = fold_vec(v0, *vp++, multipliers_4);
+ v1 = fold_vec(v1, *vp++, multipliers_4);
+ v2 = fold_vec(v2, *vp++, multipliers_4);
+ v3 = fold_vec(v3, *vp++, multipliers_4);
+ len -= 64;
+ }
+ v0 = fold_vec(v0, v2, multipliers_2);
+ v1 = fold_vec(v1, v3, multipliers_2);
+ if (len & 32) {
+ v0 = fold_vec(v0, *vp++, multipliers_2);
+ v1 = fold_vec(v1, *vp++, multipliers_2);
+ }
+ v0 = fold_vec(v0, v1, multipliers_1);
+ if (len & 16)
+ v0 = fold_vec(v0, *vp++, multipliers_1);
+ p = (const u8 *)vp;
+ len &= 15;
+ }
+
+ /* Handle any remaining partial block now before reducing to 32 bits. */
+ if (len)
+ v0 = fold_partial_vec(v0, p, len, multipliers_1);
+
+ /*
+ * Fold 128 => 96 bits. This also implicitly appends 32 zero bits,
+ * which is equivalent to multiplying by x^32. This is needed because
+ * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
+ */
+
+ v0 = veorq_u8(vextq_u8(v0, zeroes, 8),
+ clmul_high(vextq_u8(zeroes, v0, 8), multipliers_1));
+
+ /* Fold 96 => 64 bits. */
+ v0 = veorq_u8(vextq_u8(v0, zeroes, 4),
+ clmul_low(vandq_u8(v0, mask32),
+ load_multipliers(final_mults[0])));
+
+ /* Reduce 64 => 32 bits using Barrett reduction. */
+ v1 = clmul_low(vandq_u8(v0, mask32), load_multipliers(final_mults[1]));
+ v1 = clmul_low(vandq_u8(v1, mask32), load_multipliers(final_mults[2]));
+ return vgetq_lane_u32(vreinterpretq_u32_u8(veorq_u8(v0, v1)), 1);
+}
+#undef SUFFIX
+#undef ATTRIBUTES
+#undef ENABLE_EOR3
+#endif /* crc32_arm_pmullx4() */
+
+/*
+ * crc32_arm_pmullx12_crc() - large-stride implementation using "folding" with
+ * pmull instructions, where crc32 instructions are also available
+ *
+ * See crc32_pmull_wide.h for explanation.
+ */
+#if defined(ARCH_ARM64) && HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN
+# define crc32_arm_pmullx12_crc crc32_arm_pmullx12_crc
+# define SUFFIX _pmullx12_crc
+# if HAVE_PMULL_NATIVE && HAVE_CRC32_NATIVE
+# define ATTRIBUTES
+# else
+# ifdef __clang__
+# define ATTRIBUTES _target_attribute("crypto,crc")
+# else
+# define ATTRIBUTES _target_attribute("+crypto,+crc")
+# endif
+# endif
+# define ENABLE_EOR3 0
+# include "crc32_pmull_wide.h"
+#endif
+
+/*
+ * crc32_arm_pmullx12_crc_eor3()
+ *
+ * This like crc32_arm_pmullx12_crc(), but it adds the eor3 instruction (from
+ * the sha3 extension) for even better performance.
+ *
+ * Note: we require HAVE_SHA3_TARGET (or HAVE_SHA3_NATIVE) rather than
+ * HAVE_SHA3_INTRIN, as we have an inline asm fallback for eor3.
+ */
+#if defined(ARCH_ARM64) && HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN && \
+ (HAVE_SHA3_TARGET || HAVE_SHA3_NATIVE)
+# define crc32_arm_pmullx12_crc_eor3 crc32_arm_pmullx12_crc_eor3
+# define SUFFIX _pmullx12_crc_eor3
+# if HAVE_PMULL_NATIVE && HAVE_CRC32_NATIVE && HAVE_SHA3_NATIVE
+# define ATTRIBUTES
+# else
+# ifdef __clang__
+# define ATTRIBUTES _target_attribute("crypto,crc,sha3")
+ /*
+ * With gcc, arch=armv8.2-a is needed for the sha3 intrinsics, unless the
+ * default target is armv8.3-a or later in which case it must be omitted.
+ * armv8.3-a or later can be detected by checking for __ARM_FEATURE_JCVT.
+ */
+# elif defined(__ARM_FEATURE_JCVT)
+# define ATTRIBUTES _target_attribute("+crypto,+crc,+sha3")
+# else
+# define ATTRIBUTES _target_attribute("arch=armv8.2-a+crypto+crc+sha3")
+# endif
+# endif
+# define ENABLE_EOR3 1
+# include "crc32_pmull_wide.h"
+#endif
+
+/*
+ * On the Apple M1 processor, crc32 instructions max out at about 25.5 GB/s in
+ * the best case of using a 3-way or greater interleaved chunked implementation,
+ * whereas a pmull-based implementation achieves 68 GB/s provided that the
+ * stride length is large enough (about 10+ vectors with eor3, or 12+ without).
+ *
+ * For now we assume that crc32 instructions are preferable in other cases.
+ */
+#define PREFER_PMULL_TO_CRC 0
+#ifdef __APPLE__
+# include <TargetConditionals.h>
+# if TARGET_OS_OSX
+# undef PREFER_PMULL_TO_CRC
+# define PREFER_PMULL_TO_CRC 1
+# endif
+#endif
+
+/*
+ * If the best implementation is statically available, use it unconditionally.
+ * Otherwise choose the best implementation at runtime.
+ */
+#if PREFER_PMULL_TO_CRC && defined(crc32_arm_pmullx12_crc_eor3) && \
+ HAVE_PMULL_NATIVE && HAVE_CRC32_NATIVE && HAVE_SHA3_NATIVE
+# define DEFAULT_IMPL crc32_arm_pmullx12_crc_eor3
+#elif !PREFER_PMULL_TO_CRC && defined(crc32_arm_crc_pmullcombine) && \
+ HAVE_CRC32_NATIVE && HAVE_PMULL_NATIVE
+# define DEFAULT_IMPL crc32_arm_crc_pmullcombine
+#else
+static inline crc32_func_t
+arch_select_crc32_func(void)
+{
+ const u32 features MAYBE_UNUSED = get_arm_cpu_features();
+
+#if PREFER_PMULL_TO_CRC && defined(crc32_arm_pmullx12_crc_eor3)
+ if (HAVE_PMULL(features) && HAVE_CRC32(features) && HAVE_SHA3(features))
+ return crc32_arm_pmullx12_crc_eor3;
+#endif
+#if PREFER_PMULL_TO_CRC && defined(crc32_arm_pmullx12_crc)
+ if (HAVE_PMULL(features) && HAVE_CRC32(features))
+ return crc32_arm_pmullx12_crc;
+#endif
+#ifdef crc32_arm_crc_pmullcombine
+ if (HAVE_CRC32(features) && HAVE_PMULL(features))
+ return crc32_arm_crc_pmullcombine;
+#endif
+#ifdef crc32_arm_crc
+ if (HAVE_CRC32(features))
+ return crc32_arm_crc;
+#endif
+#ifdef crc32_arm_pmullx4
+ if (HAVE_PMULL(features))
+ return crc32_arm_pmullx4;
+#endif
+ return NULL;
+}
+#define arch_select_crc32_func arch_select_crc32_func
+#endif
+
+#endif /* LIB_ARM_CRC32_IMPL_H */
diff --git a/tools/z64compress/src/enc/libdeflate/lib/arm/crc32_pmull_helpers.h b/tools/z64compress/src/enc/libdeflate/lib/arm/crc32_pmull_helpers.h
new file mode 100644
index 000000000..1cd1cc188
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/lib/arm/crc32_pmull_helpers.h
@@ -0,0 +1,184 @@
+/*
+ * arm/crc32_pmull_helpers.h - helper functions for CRC-32 folding with PMULL
+ *
+ * Copyright 2022 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * This file is a "template" for instantiating helper functions for CRC folding
+ * with pmull instructions. It accepts the following parameters:
+ *
+ * SUFFIX:
+ * Name suffix to append to all instantiated functions.
+ * ATTRIBUTES:
+ * Target function attributes to use.
+ * ENABLE_EOR3:
+ * Use the eor3 instruction (from the sha3 extension).
+ */
+
+#include <arm_neon.h>
+
+/* Create a vector with 'a' in the first 4 bytes, and the rest zeroed out. */
+#undef u32_to_bytevec
+static forceinline ATTRIBUTES uint8x16_t
+ADD_SUFFIX(u32_to_bytevec)(u32 a)
+{
+ return vreinterpretq_u8_u32(vsetq_lane_u32(a, vdupq_n_u32(0), 0));
+}
+#define u32_to_bytevec ADD_SUFFIX(u32_to_bytevec)
+
+/* Load two 64-bit values into a vector. */
+#undef load_multipliers
+static forceinline ATTRIBUTES poly64x2_t
+ADD_SUFFIX(load_multipliers)(const u64 p[2])
+{
+ return vreinterpretq_p64_u64(vld1q_u64(p));
+}
+#define load_multipliers ADD_SUFFIX(load_multipliers)
+
+/* Do carryless multiplication of the low halves of two vectors. */
+#undef clmul_low
+static forceinline ATTRIBUTES uint8x16_t
+ADD_SUFFIX(clmul_low)(uint8x16_t a, poly64x2_t b)
+{
+ return vreinterpretq_u8_p128(
+ compat_vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u8(a), 0),
+ vgetq_lane_p64(b, 0)));
+}
+#define clmul_low ADD_SUFFIX(clmul_low)
+
+/* Do carryless multiplication of the high halves of two vectors. */
+#undef clmul_high
+static forceinline ATTRIBUTES uint8x16_t
+ADD_SUFFIX(clmul_high)(uint8x16_t a, poly64x2_t b)
+{
+#if defined(__clang__) && defined(ARCH_ARM64)
+ /*
+ * Use inline asm to ensure that pmull2 is really used. This works
+ * around clang bug https://github.com/llvm/llvm-project/issues/52868.
+ */
+ uint8x16_t res;
+
+ __asm__("pmull2 %0.1q, %1.2d, %2.2d" : "=w" (res) : "w" (a), "w" (b));
+ return res;
+#else
+ return vreinterpretq_u8_p128(vmull_high_p64(vreinterpretq_p64_u8(a), b));
+#endif
+}
+#define clmul_high ADD_SUFFIX(clmul_high)
+
+#undef eor3
+static forceinline ATTRIBUTES uint8x16_t
+ADD_SUFFIX(eor3)(uint8x16_t a, uint8x16_t b, uint8x16_t c)
+{
+#if ENABLE_EOR3
+#if HAVE_SHA3_INTRIN
+ return veor3q_u8(a, b, c);
+#else
+ uint8x16_t res;
+
+ __asm__("eor3 %0.16b, %1.16b, %2.16b, %3.16b"
+ : "=w" (res) : "w" (a), "w" (b), "w" (c));
+ return res;
+#endif
+#else /* ENABLE_EOR3 */
+ return veorq_u8(veorq_u8(a, b), c);
+#endif /* !ENABLE_EOR3 */
+}
+#define eor3 ADD_SUFFIX(eor3)
+
+#undef fold_vec
+static forceinline ATTRIBUTES uint8x16_t
+ADD_SUFFIX(fold_vec)(uint8x16_t src, uint8x16_t dst, poly64x2_t multipliers)
+{
+ uint8x16_t a = clmul_low(src, multipliers);
+ uint8x16_t b = clmul_high(src, multipliers);
+
+ return eor3(a, b, dst);
+}
+#define fold_vec ADD_SUFFIX(fold_vec)
+
+#undef vtbl
+static forceinline ATTRIBUTES uint8x16_t
+ADD_SUFFIX(vtbl)(uint8x16_t table, uint8x16_t indices)
+{
+#ifdef ARCH_ARM64
+ return vqtbl1q_u8(table, indices);
+#else
+ uint8x8x2_t tab2;
+
+ tab2.val[0] = vget_low_u8(table);
+ tab2.val[1] = vget_high_u8(table);
+
+ return vcombine_u8(vtbl2_u8(tab2, vget_low_u8(indices)),
+ vtbl2_u8(tab2, vget_high_u8(indices)));
+#endif
+}
+#define vtbl ADD_SUFFIX(vtbl)
+
+/*
+ * Given v containing a 16-byte polynomial, and a pointer 'p' that points to the
+ * next '1 <= len <= 15' data bytes, rearrange the concatenation of v and the
+ * data into vectors x0 and x1 that contain 'len' bytes and 16 bytes,
+ * respectively. Then fold x0 into x1 and return the result. Assumes that
+ * 'p + len - 16' is in-bounds.
+ */
+#undef fold_partial_vec
+static forceinline ATTRIBUTES MAYBE_UNUSED uint8x16_t
+ADD_SUFFIX(fold_partial_vec)(uint8x16_t v, const u8 *p, size_t len,
+ poly64x2_t multipliers_1)
+{
+ /*
+ * vtbl(v, shift_tab[len..len+15]) left shifts v by 16-len bytes.
+ * vtbl(v, shift_tab[len+16..len+31]) right shifts v by len bytes.
+ */
+ static const u8 shift_tab[48] = {
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ };
+ const uint8x16_t lshift = vld1q_u8(&shift_tab[len]);
+ const uint8x16_t rshift = vld1q_u8(&shift_tab[len + 16]);
+ uint8x16_t x0, x1, bsl_mask;
+
+ /* x0 = v left-shifted by '16 - len' bytes */
+ x0 = vtbl(v, lshift);
+
+ /* Create a vector of '16 - len' 0x00 bytes, then 'len' 0xff bytes. */
+ bsl_mask = vreinterpretq_u8_s8(
+ vshrq_n_s8(vreinterpretq_s8_u8(rshift), 7));
+
+ /*
+ * x1 = the last '16 - len' bytes from v (i.e. v right-shifted by 'len'
+ * bytes) followed by the remaining data.
+ */
+ x1 = vbslq_u8(bsl_mask /* 0 bits select from arg3, 1 bits from arg2 */,
+ vld1q_u8(p + len - 16), vtbl(v, rshift));
+
+ return fold_vec(x0, x1, multipliers_1);
+}
+#define fold_partial_vec ADD_SUFFIX(fold_partial_vec)
diff --git a/tools/z64compress/src/enc/libdeflate/lib/arm/crc32_pmull_wide.h b/tools/z64compress/src/enc/libdeflate/lib/arm/crc32_pmull_wide.h
new file mode 100644
index 000000000..a72e1d876
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/lib/arm/crc32_pmull_wide.h
@@ -0,0 +1,227 @@
+/*
+ * arm/crc32_pmull_wide.h - gzip CRC-32 with PMULL (extra-wide version)
+ *
+ * Copyright 2022 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * This file is a "template" for instantiating PMULL-based crc32_arm functions.
+ * The "parameters" are:
+ *
+ * SUFFIX:
+ * Name suffix to append to all instantiated functions.
+ * ATTRIBUTES:
+ * Target function attributes to use.
+ * ENABLE_EOR3:
+ * Use the eor3 instruction (from the sha3 extension).
+ *
+ * This is the extra-wide version; it uses an unusually large stride length of
+ * 12, and it assumes that crc32 instructions are available too. It's intended
+ * for powerful CPUs that support both pmull and crc32 instructions, but where
+ * throughput of pmull and xor (given enough instructions issued in parallel) is
+ * significantly higher than that of crc32, thus making the crc32 instructions
+ * (counterintuitively) not actually the fastest way to compute the CRC-32. The
+ * Apple M1 processor is an example of such a CPU.
+ */
+
+#ifndef _MSC_VER
+# include <arm_acle.h>
+#endif
+#include <arm_neon.h>
+
+#include "crc32_pmull_helpers.h"
+
+static u32 ATTRIBUTES MAYBE_UNUSED
+ADD_SUFFIX(crc32_arm)(u32 crc, const u8 *p, size_t len)
+{
+ uint8x16_t v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11;
+
+ if (len < 3 * 192) {
+ static const u64 _aligned_attribute(16) mults[3][2] = {
+ CRC32_4VECS_MULTS, CRC32_2VECS_MULTS, CRC32_1VECS_MULTS,
+ };
+ poly64x2_t multipliers_4, multipliers_2, multipliers_1;
+
+ if (len < 64)
+ goto tail;
+ multipliers_4 = load_multipliers(mults[0]);
+ multipliers_2 = load_multipliers(mults[1]);
+ multipliers_1 = load_multipliers(mults[2]);
+ /*
+ * Short length; don't bother aligning the pointer, and fold
+ * 64 bytes (4 vectors) at a time, at most.
+ */
+ v0 = veorq_u8(vld1q_u8(p + 0), u32_to_bytevec(crc));
+ v1 = vld1q_u8(p + 16);
+ v2 = vld1q_u8(p + 32);
+ v3 = vld1q_u8(p + 48);
+ p += 64;
+ len -= 64;
+ while (len >= 64) {
+ v0 = fold_vec(v0, vld1q_u8(p + 0), multipliers_4);
+ v1 = fold_vec(v1, vld1q_u8(p + 16), multipliers_4);
+ v2 = fold_vec(v2, vld1q_u8(p + 32), multipliers_4);
+ v3 = fold_vec(v3, vld1q_u8(p + 48), multipliers_4);
+ p += 64;
+ len -= 64;
+ }
+ v0 = fold_vec(v0, v2, multipliers_2);
+ v1 = fold_vec(v1, v3, multipliers_2);
+ if (len >= 32) {
+ v0 = fold_vec(v0, vld1q_u8(p + 0), multipliers_2);
+ v1 = fold_vec(v1, vld1q_u8(p + 16), multipliers_2);
+ p += 32;
+ len -= 32;
+ }
+ v0 = fold_vec(v0, v1, multipliers_1);
+ } else {
+ static const u64 _aligned_attribute(16) mults[4][2] = {
+ CRC32_12VECS_MULTS, CRC32_6VECS_MULTS,
+ CRC32_3VECS_MULTS, CRC32_1VECS_MULTS,
+ };
+ const poly64x2_t multipliers_12 = load_multipliers(mults[0]);
+ const poly64x2_t multipliers_6 = load_multipliers(mults[1]);
+ const poly64x2_t multipliers_3 = load_multipliers(mults[2]);
+ const poly64x2_t multipliers_1 = load_multipliers(mults[3]);
+ const size_t align = -(uintptr_t)p & 15;
+ const uint8x16_t *vp;
+
+ /* Align p to the next 16-byte boundary. */
+ if (align) {
+ if (align & 1)
+ crc = __crc32b(crc, *p++);
+ if (align & 2) {
+ crc = __crc32h(crc, le16_bswap(*(u16 *)p));
+ p += 2;
+ }
+ if (align & 4) {
+ crc = __crc32w(crc, le32_bswap(*(u32 *)p));
+ p += 4;
+ }
+ if (align & 8) {
+ crc = __crc32d(crc, le64_bswap(*(u64 *)p));
+ p += 8;
+ }
+ len -= align;
+ }
+ vp = (const uint8x16_t *)p;
+ v0 = veorq_u8(*vp++, u32_to_bytevec(crc));
+ v1 = *vp++;
+ v2 = *vp++;
+ v3 = *vp++;
+ v4 = *vp++;
+ v5 = *vp++;
+ v6 = *vp++;
+ v7 = *vp++;
+ v8 = *vp++;
+ v9 = *vp++;
+ v10 = *vp++;
+ v11 = *vp++;
+ len -= 192;
+ /* Fold 192 bytes (12 vectors) at a time. */
+ do {
+ v0 = fold_vec(v0, *vp++, multipliers_12);
+ v1 = fold_vec(v1, *vp++, multipliers_12);
+ v2 = fold_vec(v2, *vp++, multipliers_12);
+ v3 = fold_vec(v3, *vp++, multipliers_12);
+ v4 = fold_vec(v4, *vp++, multipliers_12);
+ v5 = fold_vec(v5, *vp++, multipliers_12);
+ v6 = fold_vec(v6, *vp++, multipliers_12);
+ v7 = fold_vec(v7, *vp++, multipliers_12);
+ v8 = fold_vec(v8, *vp++, multipliers_12);
+ v9 = fold_vec(v9, *vp++, multipliers_12);
+ v10 = fold_vec(v10, *vp++, multipliers_12);
+ v11 = fold_vec(v11, *vp++, multipliers_12);
+ len -= 192;
+ } while (len >= 192);
+
+ /*
+ * Fewer than 192 bytes left. Fold v0-v11 down to just v0,
+ * while processing up to 144 more bytes.
+ */
+ v0 = fold_vec(v0, v6, multipliers_6);
+ v1 = fold_vec(v1, v7, multipliers_6);
+ v2 = fold_vec(v2, v8, multipliers_6);
+ v3 = fold_vec(v3, v9, multipliers_6);
+ v4 = fold_vec(v4, v10, multipliers_6);
+ v5 = fold_vec(v5, v11, multipliers_6);
+ if (len >= 96) {
+ v0 = fold_vec(v0, *vp++, multipliers_6);
+ v1 = fold_vec(v1, *vp++, multipliers_6);
+ v2 = fold_vec(v2, *vp++, multipliers_6);
+ v3 = fold_vec(v3, *vp++, multipliers_6);
+ v4 = fold_vec(v4, *vp++, multipliers_6);
+ v5 = fold_vec(v5, *vp++, multipliers_6);
+ len -= 96;
+ }
+ v0 = fold_vec(v0, v3, multipliers_3);
+ v1 = fold_vec(v1, v4, multipliers_3);
+ v2 = fold_vec(v2, v5, multipliers_3);
+ if (len >= 48) {
+ v0 = fold_vec(v0, *vp++, multipliers_3);
+ v1 = fold_vec(v1, *vp++, multipliers_3);
+ v2 = fold_vec(v2, *vp++, multipliers_3);
+ len -= 48;
+ }
+ v0 = fold_vec(v0, v1, multipliers_1);
+ v0 = fold_vec(v0, v2, multipliers_1);
+ p = (const u8 *)vp;
+ }
+ /* Reduce 128 to 32 bits using crc32 instructions. */
+ crc = __crc32d(0, vgetq_lane_u64(vreinterpretq_u64_u8(v0), 0));
+ crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(v0), 1));
+tail:
+ /* Finish up the remainder using crc32 instructions. */
+ if (len & 32) {
+ crc = __crc32d(crc, get_unaligned_le64(p + 0));
+ crc = __crc32d(crc, get_unaligned_le64(p + 8));
+ crc = __crc32d(crc, get_unaligned_le64(p + 16));
+ crc = __crc32d(crc, get_unaligned_le64(p + 24));
+ p += 32;
+ }
+ if (len & 16) {
+ crc = __crc32d(crc, get_unaligned_le64(p + 0));
+ crc = __crc32d(crc, get_unaligned_le64(p + 8));
+ p += 16;
+ }
+ if (len & 8) {
+ crc = __crc32d(crc, get_unaligned_le64(p));
+ p += 8;
+ }
+ if (len & 4) {
+ crc = __crc32w(crc, get_unaligned_le32(p));
+ p += 4;
+ }
+ if (len & 2) {
+ crc = __crc32h(crc, get_unaligned_le16(p));
+ p += 2;
+ }
+ if (len & 1)
+ crc = __crc32b(crc, *p);
+ return crc;
+}
+
+#undef SUFFIX
+#undef ATTRIBUTES
+#undef ENABLE_EOR3
diff --git a/tools/z64compress/src/enc/libdeflate/lib/arm/matchfinder_impl.h b/tools/z64compress/src/enc/libdeflate/lib/arm/matchfinder_impl.h
new file mode 100644
index 000000000..b20f56a3b
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/lib/arm/matchfinder_impl.h
@@ -0,0 +1,79 @@
+/*
+ * arm/matchfinder_impl.h - ARM implementations of matchfinder functions
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef LIB_ARM_MATCHFINDER_IMPL_H
+#define LIB_ARM_MATCHFINDER_IMPL_H
+
+#include "cpu_features.h"
+
+#if HAVE_NEON_NATIVE
+# include <arm_neon.h>
+static forceinline void
+matchfinder_init_neon(mf_pos_t *data, size_t size)
+{
+ int16x8_t *p = (int16x8_t *)data;
+ int16x8_t v = vdupq_n_s16(MATCHFINDER_INITVAL);
+
+ STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0);
+ STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0);
+ STATIC_ASSERT(sizeof(mf_pos_t) == 2);
+
+ do {
+ p[0] = v;
+ p[1] = v;
+ p[2] = v;
+ p[3] = v;
+ p += 4;
+ size -= 4 * sizeof(*p);
+ } while (size != 0);
+}
+#define matchfinder_init matchfinder_init_neon
+
+static forceinline void
+matchfinder_rebase_neon(mf_pos_t *data, size_t size)
+{
+ int16x8_t *p = (int16x8_t *)data;
+ int16x8_t v = vdupq_n_s16((u16)-MATCHFINDER_WINDOW_SIZE);
+
+ STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0);
+ STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0);
+ STATIC_ASSERT(sizeof(mf_pos_t) == 2);
+
+ do {
+ p[0] = vqaddq_s16(p[0], v);
+ p[1] = vqaddq_s16(p[1], v);
+ p[2] = vqaddq_s16(p[2], v);
+ p[3] = vqaddq_s16(p[3], v);
+ p += 4;
+ size -= 4 * sizeof(*p);
+ } while (size != 0);
+}
+#define matchfinder_rebase matchfinder_rebase_neon
+
+#endif /* HAVE_NEON_NATIVE */
+
+#endif /* LIB_ARM_MATCHFINDER_IMPL_H */
diff --git a/tools/z64compress/src/enc/libdeflate/lib/bt_matchfinder.h b/tools/z64compress/src/enc/libdeflate/lib/bt_matchfinder.h
new file mode 100644
index 000000000..b247d4bcc
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/lib/bt_matchfinder.h
@@ -0,0 +1,342 @@
+/*
+ * bt_matchfinder.h - Lempel-Ziv matchfinding with a hash table of binary trees
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ----------------------------------------------------------------------------
+ *
+ * This is a Binary Trees (bt) based matchfinder.
+ *
+ * The main data structure is a hash table where each hash bucket contains a
+ * binary tree of sequences whose first 4 bytes share the same hash code. Each
+ * sequence is identified by its starting position in the input buffer. Each
+ * binary tree is always sorted such that each left child represents a sequence
+ * lexicographically lesser than its parent and each right child represents a
+ * sequence lexicographically greater than its parent.
+ *
+ * The algorithm processes the input buffer sequentially. At each byte
+ * position, the hash code of the first 4 bytes of the sequence beginning at
+ * that position (the sequence being matched against) is computed. This
+ * identifies the hash bucket to use for that position. Then, a new binary tree
+ * node is created to represent the current sequence. Then, in a single tree
+ * traversal, the hash bucket's binary tree is searched for matches and is
+ * re-rooted at the new node.
+ *
+ * Compared to the simpler algorithm that uses linked lists instead of binary
+ * trees (see hc_matchfinder.h), the binary tree version gains more information
+ * at each node visitation. Ideally, the binary tree version will examine only
+ * 'log(n)' nodes to find the same matches that the linked list version will
+ * find by examining 'n' nodes. In addition, the binary tree version can
+ * examine fewer bytes at each node by taking advantage of the common prefixes
+ * that result from the sort order, whereas the linked list version may have to
+ * examine up to the full length of the match at each node.
+ *
+ * However, it is not always best to use the binary tree version. It requires
+ * nearly twice as much memory as the linked list version, and it takes time to
+ * keep the binary trees sorted, even at positions where the compressor does not
+ * need matches. Generally, when doing fast compression on small buffers,
+ * binary trees are the wrong approach. They are best suited for thorough
+ * compression and/or large buffers.
+ *
+ * ----------------------------------------------------------------------------
+ */
+
+#ifndef LIB_BT_MATCHFINDER_H
+#define LIB_BT_MATCHFINDER_H
+
+#include "matchfinder_common.h"
+
+#define BT_MATCHFINDER_HASH3_ORDER 16
+#define BT_MATCHFINDER_HASH3_WAYS 2
+#define BT_MATCHFINDER_HASH4_ORDER 16
+
+#define BT_MATCHFINDER_TOTAL_HASH_SIZE \
+ (((1UL << BT_MATCHFINDER_HASH3_ORDER) * BT_MATCHFINDER_HASH3_WAYS + \
+ (1UL << BT_MATCHFINDER_HASH4_ORDER)) * sizeof(mf_pos_t))
+
+/* Representation of a match found by the bt_matchfinder */
+struct lz_match {
+
+ /* The number of bytes matched. */
+ u16 length;
+
+ /* The offset back from the current position that was matched. */
+ u16 offset;
+};
+
+struct MATCHFINDER_ALIGNED bt_matchfinder {
+
+ /* The hash table for finding length 3 matches */
+ mf_pos_t hash3_tab[1UL << BT_MATCHFINDER_HASH3_ORDER][BT_MATCHFINDER_HASH3_WAYS];
+
+ /* The hash table which contains the roots of the binary trees for
+ * finding length 4+ matches */
+ mf_pos_t hash4_tab[1UL << BT_MATCHFINDER_HASH4_ORDER];
+
+ /* The child node references for the binary trees. The left and right
+ * children of the node for the sequence with position 'pos' are
+ * 'child_tab[pos * 2]' and 'child_tab[pos * 2 + 1]', respectively. */
+ mf_pos_t child_tab[2UL * MATCHFINDER_WINDOW_SIZE];
+};
+
+/* Prepare the matchfinder for a new input buffer. */
+static forceinline void
+bt_matchfinder_init(struct bt_matchfinder *mf)
+{
+ STATIC_ASSERT(BT_MATCHFINDER_TOTAL_HASH_SIZE %
+ MATCHFINDER_SIZE_ALIGNMENT == 0);
+
+ matchfinder_init((mf_pos_t *)mf, BT_MATCHFINDER_TOTAL_HASH_SIZE);
+}
+
+static forceinline void
+bt_matchfinder_slide_window(struct bt_matchfinder *mf)
+{
+ STATIC_ASSERT(sizeof(*mf) % MATCHFINDER_SIZE_ALIGNMENT == 0);
+
+ matchfinder_rebase((mf_pos_t *)mf, sizeof(*mf));
+}
+
+static forceinline mf_pos_t *
+bt_left_child(struct bt_matchfinder *mf, s32 node)
+{
+ return &mf->child_tab[2 * (node & (MATCHFINDER_WINDOW_SIZE - 1)) + 0];
+}
+
+static forceinline mf_pos_t *
+bt_right_child(struct bt_matchfinder *mf, s32 node)
+{
+ return &mf->child_tab[2 * (node & (MATCHFINDER_WINDOW_SIZE - 1)) + 1];
+}
+
+/* The minimum permissible value of 'max_len' for bt_matchfinder_get_matches()
+ * and bt_matchfinder_skip_byte(). There must be sufficiently many bytes
+ * remaining to load a 32-bit integer from the *next* position. */
+#define BT_MATCHFINDER_REQUIRED_NBYTES 5
+
+/* Advance the binary tree matchfinder by one byte, optionally recording
+ * matches. @record_matches should be a compile-time constant. */
+static forceinline struct lz_match *
+bt_matchfinder_advance_one_byte(struct bt_matchfinder * const mf,
+ const u8 * const in_base,
+ const ptrdiff_t cur_pos,
+ const u32 max_len,
+ const u32 nice_len,
+ const u32 max_search_depth,
+ u32 * const next_hashes,
+ struct lz_match *lz_matchptr,
+ const bool record_matches)
+{
+ const u8 *in_next = in_base + cur_pos;
+ u32 depth_remaining = max_search_depth;
+ const s32 cutoff = cur_pos - MATCHFINDER_WINDOW_SIZE;
+ u32 next_hashseq;
+ u32 hash3;
+ u32 hash4;
+ s32 cur_node;
+#if BT_MATCHFINDER_HASH3_WAYS >= 2
+ s32 cur_node_2;
+#endif
+ const u8 *matchptr;
+ mf_pos_t *pending_lt_ptr, *pending_gt_ptr;
+ u32 best_lt_len, best_gt_len;
+ u32 len;
+ u32 best_len = 3;
+
+ STATIC_ASSERT(BT_MATCHFINDER_HASH3_WAYS >= 1 &&
+ BT_MATCHFINDER_HASH3_WAYS <= 2);
+
+ next_hashseq = get_unaligned_le32(in_next + 1);
+
+ hash3 = next_hashes[0];
+ hash4 = next_hashes[1];
+
+ next_hashes[0] = lz_hash(next_hashseq & 0xFFFFFF, BT_MATCHFINDER_HASH3_ORDER);
+ next_hashes[1] = lz_hash(next_hashseq, BT_MATCHFINDER_HASH4_ORDER);
+ prefetchw(&mf->hash3_tab[next_hashes[0]]);
+ prefetchw(&mf->hash4_tab[next_hashes[1]]);
+
+ cur_node = mf->hash3_tab[hash3][0];
+ mf->hash3_tab[hash3][0] = cur_pos;
+#if BT_MATCHFINDER_HASH3_WAYS >= 2
+ cur_node_2 = mf->hash3_tab[hash3][1];
+ mf->hash3_tab[hash3][1] = cur_node;
+#endif
+ if (record_matches && cur_node > cutoff) {
+ u32 seq3 = load_u24_unaligned(in_next);
+ if (seq3 == load_u24_unaligned(&in_base[cur_node])) {
+ lz_matchptr->length = 3;
+ lz_matchptr->offset = in_next - &in_base[cur_node];
+ lz_matchptr++;
+ }
+ #if BT_MATCHFINDER_HASH3_WAYS >= 2
+ else if (cur_node_2 > cutoff &&
+ seq3 == load_u24_unaligned(&in_base[cur_node_2]))
+ {
+ lz_matchptr->length = 3;
+ lz_matchptr->offset = in_next - &in_base[cur_node_2];
+ lz_matchptr++;
+ }
+ #endif
+ }
+
+ cur_node = mf->hash4_tab[hash4];
+ mf->hash4_tab[hash4] = cur_pos;
+
+ pending_lt_ptr = bt_left_child(mf, cur_pos);
+ pending_gt_ptr = bt_right_child(mf, cur_pos);
+
+ if (cur_node <= cutoff) {
+ *pending_lt_ptr = MATCHFINDER_INITVAL;
+ *pending_gt_ptr = MATCHFINDER_INITVAL;
+ return lz_matchptr;
+ }
+
+ best_lt_len = 0;
+ best_gt_len = 0;
+ len = 0;
+
+ for (;;) {
+ matchptr = &in_base[cur_node];
+
+ if (matchptr[len] == in_next[len]) {
+ len = lz_extend(in_next, matchptr, len + 1, max_len);
+ if (!record_matches || len > best_len) {
+ if (record_matches) {
+ best_len = len;
+ lz_matchptr->length = len;
+ lz_matchptr->offset = in_next - matchptr;
+ lz_matchptr++;
+ }
+ if (len >= nice_len) {
+ *pending_lt_ptr = *bt_left_child(mf, cur_node);
+ *pending_gt_ptr = *bt_right_child(mf, cur_node);
+ return lz_matchptr;
+ }
+ }
+ }
+
+ if (matchptr[len] < in_next[len]) {
+ *pending_lt_ptr = cur_node;
+ pending_lt_ptr = bt_right_child(mf, cur_node);
+ cur_node = *pending_lt_ptr;
+ best_lt_len = len;
+ if (best_gt_len < len)
+ len = best_gt_len;
+ } else {
+ *pending_gt_ptr = cur_node;
+ pending_gt_ptr = bt_left_child(mf, cur_node);
+ cur_node = *pending_gt_ptr;
+ best_gt_len = len;
+ if (best_lt_len < len)
+ len = best_lt_len;
+ }
+
+ if (cur_node <= cutoff || !--depth_remaining) {
+ *pending_lt_ptr = MATCHFINDER_INITVAL;
+ *pending_gt_ptr = MATCHFINDER_INITVAL;
+ return lz_matchptr;
+ }
+ }
+}
+
+/*
+ * Retrieve a list of matches with the current position.
+ *
+ * @mf
+ * The matchfinder structure.
+ * @in_base
+ * Pointer to the next byte in the input buffer to process _at the last
+ * time bt_matchfinder_init() or bt_matchfinder_slide_window() was called_.
+ * @cur_pos
+ * The current position in the input buffer relative to @in_base (the
+ * position of the sequence being matched against).
+ * @max_len
+ * The maximum permissible match length at this position. Must be >=
+ * BT_MATCHFINDER_REQUIRED_NBYTES.
+ * @nice_len
+ * Stop searching if a match of at least this length is found.
+ * Must be <= @max_len.
+ * @max_search_depth
+ * Limit on the number of potential matches to consider. Must be >= 1.
+ * @next_hashes
+ * The precomputed hash codes for the sequence beginning at @in_next.
+ * These will be used and then updated with the precomputed hashcodes for
+ * the sequence beginning at @in_next + 1.
+ * @lz_matchptr
+ * An array in which this function will record the matches. The recorded
+ * matches will be sorted by strictly increasing length and (non-strictly)
+ * increasing offset. The maximum number of matches that may be found is
+ * 'nice_len - 2'.
+ *
+ * The return value is a pointer to the next available slot in the @lz_matchptr
+ * array. (If no matches were found, this will be the same as @lz_matchptr.)
+ */
+static forceinline struct lz_match *
+bt_matchfinder_get_matches(struct bt_matchfinder *mf,
+ const u8 *in_base,
+ ptrdiff_t cur_pos,
+ u32 max_len,
+ u32 nice_len,
+ u32 max_search_depth,
+ u32 next_hashes[2],
+ struct lz_match *lz_matchptr)
+{
+ return bt_matchfinder_advance_one_byte(mf,
+ in_base,
+ cur_pos,
+ max_len,
+ nice_len,
+ max_search_depth,
+ next_hashes,
+ lz_matchptr,
+ true);
+}
+
+/*
+ * Advance the matchfinder, but don't record any matches.
+ *
+ * This is very similar to bt_matchfinder_get_matches() because both functions
+ * must do hashing and tree re-rooting.
+ */
+static forceinline void
+bt_matchfinder_skip_byte(struct bt_matchfinder *mf,
+ const u8 *in_base,
+ ptrdiff_t cur_pos,
+ u32 nice_len,
+ u32 max_search_depth,
+ u32 next_hashes[2])
+{
+ bt_matchfinder_advance_one_byte(mf,
+ in_base,
+ cur_pos,
+ nice_len,
+ nice_len,
+ max_search_depth,
+ next_hashes,
+ NULL,
+ false);
+}
+
+#endif /* LIB_BT_MATCHFINDER_H */
diff --git a/tools/z64compress/src/enc/libdeflate/lib/cpu_features_common.h b/tools/z64compress/src/enc/libdeflate/lib/cpu_features_common.h
new file mode 100644
index 000000000..bfcaa3637
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/lib/cpu_features_common.h
@@ -0,0 +1,91 @@
+/*
+ * cpu_features_common.h - code shared by all lib/$arch/cpu_features.c
+ *
+ * Copyright 2020 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef LIB_CPU_FEATURES_COMMON_H
+#define LIB_CPU_FEATURES_COMMON_H
+
+#if defined(TEST_SUPPORT__DO_NOT_USE) && !defined(FREESTANDING)
+# undef _ANSI_SOURCE /* for strdup() and strtok_r() */
+# ifndef __APPLE__
+# define _GNU_SOURCE 1
+# endif
+#  include <stdio.h>
+#  include <stdlib.h>
+#  include <string.h>
+#endif
+
+#include "lib_common.h"
+
+struct cpu_feature {
+ u32 bit;
+ const char *name;
+};
+
+#if defined(TEST_SUPPORT__DO_NOT_USE) && !defined(FREESTANDING)
+/* Disable any features that are listed in $LIBDEFLATE_DISABLE_CPU_FEATURES. */
+static inline void
+disable_cpu_features_for_testing(u32 *features,
+ const struct cpu_feature *feature_table,
+ size_t feature_table_length)
+{
+ char *env_value, *strbuf, *p, *saveptr = NULL;
+ size_t i;
+
+ env_value = getenv("LIBDEFLATE_DISABLE_CPU_FEATURES");
+ if (!env_value)
+ return;
+ strbuf = strdup(env_value);
+ if (!strbuf)
+ abort();
+ p = strtok_r(strbuf, ",", &saveptr);
+ while (p) {
+ for (i = 0; i < feature_table_length; i++) {
+ if (strcmp(p, feature_table[i].name) == 0) {
+ *features &= ~feature_table[i].bit;
+ break;
+ }
+ }
+ if (i == feature_table_length) {
+ fprintf(stderr,
+ "unrecognized feature in LIBDEFLATE_DISABLE_CPU_FEATURES: \"%s\"\n",
+ p);
+ abort();
+ }
+ p = strtok_r(NULL, ",", &saveptr);
+ }
+ free(strbuf);
+}
+#else /* TEST_SUPPORT__DO_NOT_USE */
+static inline void
+disable_cpu_features_for_testing(u32 *features,
+ const struct cpu_feature *feature_table,
+ size_t feature_table_length)
+{
+}
+#endif /* !TEST_SUPPORT__DO_NOT_USE */
+
+#endif /* LIB_CPU_FEATURES_COMMON_H */
diff --git a/tools/z64compress/src/enc/libdeflate/lib/crc32.c b/tools/z64compress/src/enc/libdeflate/lib/crc32.c
new file mode 100644
index 000000000..61c2cc763
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/lib/crc32.c
@@ -0,0 +1,263 @@
+/*
+ * crc32.c - CRC-32 checksum algorithm for the gzip format
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * High-level description of CRC
+ * =============================
+ *
+ * Consider a bit sequence 'bits[1...len]'. Interpret 'bits' as the "message"
+ * polynomial M(x) with coefficients in GF(2) (the field of integers modulo 2),
+ * where the coefficient of 'x^i' is 'bits[len - i]'. Then, compute:
+ *
+ * R(x) = M(x)*x^n mod G(x)
+ *
+ * where G(x) is a selected "generator" polynomial of degree 'n'. The remainder
+ * R(x) is a polynomial of max degree 'n - 1'. The CRC of 'bits' is R(x)
+ * interpreted as a bitstring of length 'n'.
+ *
+ * CRC used in gzip
+ * ================
+ *
+ * In the gzip format (RFC 1952):
+ *
+ * - The bitstring to checksum is formed from the bytes of the uncompressed
+ * data by concatenating the bits from the bytes in order, proceeding
+ * from the low-order bit to the high-order bit within each byte.
+ *
+ * - The generator polynomial G(x) is: x^32 + x^26 + x^23 + x^22 + x^16 +
+ * x^12 + x^11 + x^10 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1.
+ * Consequently, the CRC length is 32 bits ("CRC-32").
+ *
+ * - The highest order 32 coefficients of M(x)*x^n are inverted.
+ *
+ * - All 32 coefficients of R(x) are inverted.
+ *
+ * The two inversions cause added leading and trailing zero bits to affect the
+ * resulting CRC, whereas with a regular CRC such bits would have no effect on
+ * the CRC.
+ *
+ * Computation and optimizations
+ * =============================
+ *
+ * We can compute R(x) through "long division", maintaining only 32 bits of
+ * state at any given time. Multiplication by 'x' can be implemented as
+ * right-shifting by 1 (assuming the polynomial<=>bitstring mapping where the
+ * highest order bit represents the coefficient of x^0), and both addition and
+ * subtraction can be implemented as bitwise exclusive OR (since we are working
+ * in GF(2)). Here is an unoptimized implementation:
+ *
+ * static u32 crc32_gzip(const u8 *p, size_t len)
+ * {
+ * u32 crc = 0;
+ * const u32 divisor = 0xEDB88320;
+ *
+ * for (size_t i = 0; i < len * 8 + 32; i++) {
+ * int bit;
+ * u32 multiple;
+ *
+ * if (i < len * 8)
+ * bit = (p[i / 8] >> (i % 8)) & 1;
+ * else
+ * bit = 0; // one of the 32 appended 0 bits
+ *
+ * if (i < 32) // the first 32 bits are inverted
+ * bit ^= 1;
+ *
+ * if (crc & 1)
+ * multiple = divisor;
+ * else
+ * multiple = 0;
+ *
+ * crc >>= 1;
+ * crc |= (u32)bit << 31;
+ * crc ^= multiple;
+ * }
+ *
+ * return ~crc;
+ * }
+ *
+ * In this implementation, the 32-bit integer 'crc' maintains the remainder of
+ * the currently processed portion of the message (with 32 zero bits appended)
+ * when divided by the generator polynomial. 'crc' is the representation of
+ * R(x), and 'divisor' is the representation of G(x) excluding the x^32
+ * coefficient. For each bit to process, we multiply R(x) by 'x^1', then add
+ * 'x^0' if the new bit is a 1. If this causes R(x) to gain a nonzero x^32
+ * term, then we subtract G(x) from R(x).
+ *
+ * We can speed this up by taking advantage of the fact that XOR is commutative
+ * and associative, so the order in which we combine the inputs into 'crc' is
+ * unimportant. And since each message bit we add doesn't affect the choice of
+ * 'multiple' until 32 bits later, we need not actually add each message bit
+ * until that point:
+ *
+ * static u32 crc32_gzip(const u8 *p, size_t len)
+ * {
+ * u32 crc = ~0;
+ * const u32 divisor = 0xEDB88320;
+ *
+ * for (size_t i = 0; i < len * 8; i++) {
+ * int bit;
+ * u32 multiple;
+ *
+ * bit = (p[i / 8] >> (i % 8)) & 1;
+ * crc ^= bit;
+ * if (crc & 1)
+ * multiple = divisor;
+ * else
+ * multiple = 0;
+ * crc >>= 1;
+ * crc ^= multiple;
+ * }
+ *
+ * return ~crc;
+ * }
+ *
+ * With the above implementation we get the effect of 32 appended 0 bits for
+ * free; they never affect the choice of a divisor, nor would they change the
+ * value of 'crc' if they were to be actually XOR'ed in. And by starting with a
+ * remainder of all 1 bits, we get the effect of complementing the first 32
+ * message bits.
+ *
+ * The next optimization is to process the input in multi-bit units. Suppose
+ * that we insert the next 'n' message bits into the remainder. Then we get an
+ * intermediate remainder of length '32 + n' bits, and the CRC of the extra 'n'
+ * bits is the amount by which the low 32 bits of the remainder will change as a
+ * result of cancelling out those 'n' bits. Taking n=8 (one byte) and
+ * precomputing a table containing the CRC of each possible byte, we get
+ * crc32_slice1() defined below.
+ *
+ * As a further optimization, we could increase the multi-bit unit size to 16.
+ * However, that is inefficient because the table size explodes from 256 entries
+ * (1024 bytes) to 65536 entries (262144 bytes), which wastes memory and won't
+ * fit in L1 cache on typical processors.
+ *
+ * However, we can actually process 4 bytes at a time using 4 different tables
+ * with 256 entries each. Logically, we form a 64-bit intermediate remainder
+ * and cancel out the high 32 bits in 8-bit chunks. Bits 32-39 are cancelled
+ * out by the CRC of those bits, whereas bits 40-47 are be cancelled out by the
+ * CRC of those bits with 8 zero bits appended, and so on.
+ *
+ * In crc32_slice8(), this method is extended to 8 bytes at a time. The
+ * intermediate remainder (which we never actually store explicitly) is 96 bits.
+ *
+ * On CPUs that support fast carryless multiplication, CRCs can be computed even
+ * more quickly via "folding". See e.g. the x86 PCLMUL implementation.
+ */
+
+#include "lib_common.h"
+#include "libdeflate.h"
+#include "crc32_multipliers.h"
+#include "crc32_tables.h"
+
+/* This is the default implementation. It uses the slice-by-8 method. */
+static u32 MAYBE_UNUSED
+crc32_slice8(u32 crc, const u8 *p, size_t len)
+{
+ const u8 * const end = p + len;
+ const u8 *end64;
+
+ for (; ((uintptr_t)p & 7) && p != end; p++)
+ crc = (crc >> 8) ^ crc32_slice8_table[(u8)crc ^ *p];
+
+ end64 = p + ((end - p) & ~7);
+ for (; p != end64; p += 8) {
+ u32 v1 = le32_bswap(*(const u32 *)(p + 0));
+ u32 v2 = le32_bswap(*(const u32 *)(p + 4));
+
+ crc = crc32_slice8_table[0x700 + (u8)((crc ^ v1) >> 0)] ^
+ crc32_slice8_table[0x600 + (u8)((crc ^ v1) >> 8)] ^
+ crc32_slice8_table[0x500 + (u8)((crc ^ v1) >> 16)] ^
+ crc32_slice8_table[0x400 + (u8)((crc ^ v1) >> 24)] ^
+ crc32_slice8_table[0x300 + (u8)(v2 >> 0)] ^
+ crc32_slice8_table[0x200 + (u8)(v2 >> 8)] ^
+ crc32_slice8_table[0x100 + (u8)(v2 >> 16)] ^
+ crc32_slice8_table[0x000 + (u8)(v2 >> 24)];
+ }
+
+ for (; p != end; p++)
+ crc = (crc >> 8) ^ crc32_slice8_table[(u8)crc ^ *p];
+
+ return crc;
+}
+
+/*
+ * This is a more lightweight generic implementation, which can be used as a
+ * subroutine by architecture-specific implementations to process small amounts
+ * of unaligned data at the beginning and/or end of the buffer.
+ */
+static forceinline u32 MAYBE_UNUSED
+crc32_slice1(u32 crc, const u8 *p, size_t len)
+{
+ size_t i;
+
+ for (i = 0; i < len; i++)
+ crc = (crc >> 8) ^ crc32_slice1_table[(u8)crc ^ p[i]];
+ return crc;
+}
+
+/* Include architecture-specific implementation(s) if available. */
+#undef DEFAULT_IMPL
+#undef arch_select_crc32_func
+typedef u32 (*crc32_func_t)(u32 crc, const u8 *p, size_t len);
+#if defined(ARCH_ARM32) || defined(ARCH_ARM64)
+# include "arm/crc32_impl.h"
+#elif defined(ARCH_X86_32) || defined(ARCH_X86_64)
+# include "x86/crc32_impl.h"
+#endif
+
+#ifndef DEFAULT_IMPL
+# define DEFAULT_IMPL crc32_slice8
+#endif
+
+#ifdef arch_select_crc32_func
+static u32 dispatch_crc32(u32 crc, const u8 *p, size_t len);
+
+static volatile crc32_func_t crc32_impl = dispatch_crc32;
+
+/* Choose the best implementation at runtime. */
+static u32 dispatch_crc32(u32 crc, const u8 *p, size_t len)
+{
+ crc32_func_t f = arch_select_crc32_func();
+
+ if (f == NULL)
+ f = DEFAULT_IMPL;
+
+ crc32_impl = f;
+ return f(crc, p, len);
+}
+#else
+/* The best implementation is statically known, so call it directly. */
+#define crc32_impl DEFAULT_IMPL
+#endif
+
+LIBDEFLATEAPI u32
+libdeflate_crc32(u32 crc, const void *p, size_t len)
+{
+ if (p == NULL) /* Return initial value. */
+ return 0;
+ return ~crc32_impl(~crc, p, len);
+}
diff --git a/tools/z64compress/src/enc/libdeflate/lib/crc32_multipliers.h b/tools/z64compress/src/enc/libdeflate/lib/crc32_multipliers.h
new file mode 100644
index 000000000..580b775bd
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/lib/crc32_multipliers.h
@@ -0,0 +1,329 @@
+/*
+ * crc32_multipliers.h - constants for CRC-32 folding
+ *
+ * THIS FILE WAS GENERATED BY gen_crc32_multipliers.c. DO NOT EDIT.
+ */
+
+#define CRC32_1VECS_MULT_1 0xae689191 /* x^159 mod G(x) */
+#define CRC32_1VECS_MULT_2 0xccaa009e /* x^95 mod G(x) */
+#define CRC32_1VECS_MULTS { CRC32_1VECS_MULT_1, CRC32_1VECS_MULT_2 }
+
+#define CRC32_2VECS_MULT_1 0xf1da05aa /* x^287 mod G(x) */
+#define CRC32_2VECS_MULT_2 0x81256527 /* x^223 mod G(x) */
+#define CRC32_2VECS_MULTS { CRC32_2VECS_MULT_1, CRC32_2VECS_MULT_2 }
+
+#define CRC32_3VECS_MULT_1 0x3db1ecdc /* x^415 mod G(x) */
+#define CRC32_3VECS_MULT_2 0xaf449247 /* x^351 mod G(x) */
+#define CRC32_3VECS_MULTS { CRC32_3VECS_MULT_1, CRC32_3VECS_MULT_2 }
+
+#define CRC32_4VECS_MULT_1 0x8f352d95 /* x^543 mod G(x) */
+#define CRC32_4VECS_MULT_2 0x1d9513d7 /* x^479 mod G(x) */
+#define CRC32_4VECS_MULTS { CRC32_4VECS_MULT_1, CRC32_4VECS_MULT_2 }
+
+#define CRC32_5VECS_MULT_1 0x1c279815 /* x^671 mod G(x) */
+#define CRC32_5VECS_MULT_2 0xae0b5394 /* x^607 mod G(x) */
+#define CRC32_5VECS_MULTS { CRC32_5VECS_MULT_1, CRC32_5VECS_MULT_2 }
+
+#define CRC32_6VECS_MULT_1 0xdf068dc2 /* x^799 mod G(x) */
+#define CRC32_6VECS_MULT_2 0x57c54819 /* x^735 mod G(x) */
+#define CRC32_6VECS_MULTS { CRC32_6VECS_MULT_1, CRC32_6VECS_MULT_2 }
+
+#define CRC32_7VECS_MULT_1 0x31f8303f /* x^927 mod G(x) */
+#define CRC32_7VECS_MULT_2 0x0cbec0ed /* x^863 mod G(x) */
+#define CRC32_7VECS_MULTS { CRC32_7VECS_MULT_1, CRC32_7VECS_MULT_2 }
+
+#define CRC32_8VECS_MULT_1 0x33fff533 /* x^1055 mod G(x) */
+#define CRC32_8VECS_MULT_2 0x910eeec1 /* x^991 mod G(x) */
+#define CRC32_8VECS_MULTS { CRC32_8VECS_MULT_1, CRC32_8VECS_MULT_2 }
+
+#define CRC32_9VECS_MULT_1 0x26b70c3d /* x^1183 mod G(x) */
+#define CRC32_9VECS_MULT_2 0x3f41287a /* x^1119 mod G(x) */
+#define CRC32_9VECS_MULTS { CRC32_9VECS_MULT_1, CRC32_9VECS_MULT_2 }
+
+#define CRC32_10VECS_MULT_1 0xe3543be0 /* x^1311 mod G(x) */
+#define CRC32_10VECS_MULT_2 0x9026d5b1 /* x^1247 mod G(x) */
+#define CRC32_10VECS_MULTS { CRC32_10VECS_MULT_1, CRC32_10VECS_MULT_2 }
+
+#define CRC32_11VECS_MULT_1 0x5a1bb05d /* x^1439 mod G(x) */
+#define CRC32_11VECS_MULT_2 0xd1df2327 /* x^1375 mod G(x) */
+#define CRC32_11VECS_MULTS { CRC32_11VECS_MULT_1, CRC32_11VECS_MULT_2 }
+
+#define CRC32_12VECS_MULT_1 0x596c8d81 /* x^1567 mod G(x) */
+#define CRC32_12VECS_MULT_2 0xf5e48c85 /* x^1503 mod G(x) */
+#define CRC32_12VECS_MULTS { CRC32_12VECS_MULT_1, CRC32_12VECS_MULT_2 }
+
+#define CRC32_FINAL_MULT 0xb8bc6765 /* x^63 mod G(x) */
+#define CRC32_BARRETT_CONSTANT_1 0x00000001f7011641ULL /* floor(x^64 / G(x)) */
+#define CRC32_BARRETT_CONSTANT_2 0x00000001db710641ULL /* G(x) */
+#define CRC32_BARRETT_CONSTANTS { CRC32_BARRETT_CONSTANT_1, CRC32_BARRETT_CONSTANT_2 }
+
+#define CRC32_NUM_CHUNKS 4
+#define CRC32_MIN_VARIABLE_CHUNK_LEN 128UL
+#define CRC32_MAX_VARIABLE_CHUNK_LEN 16384UL
+
+/* Multipliers for implementations that use a variable chunk length */
+static const u32 crc32_mults_for_chunklen[][CRC32_NUM_CHUNKS - 1] MAYBE_UNUSED = {
+ { 0 /* unused row */ },
+ /* chunk_len=128 */
+ { 0xd31343ea /* x^3039 mod G(x) */, 0xe95c1271 /* x^2015 mod G(x) */, 0x910eeec1 /* x^991 mod G(x) */, },
+ /* chunk_len=256 */
+ { 0x1d6708a0 /* x^6111 mod G(x) */, 0x0c30f51d /* x^4063 mod G(x) */, 0xe95c1271 /* x^2015 mod G(x) */, },
+ /* chunk_len=384 */
+ { 0xdb3839f3 /* x^9183 mod G(x) */, 0x1d6708a0 /* x^6111 mod G(x) */, 0xd31343ea /* x^3039 mod G(x) */, },
+ /* chunk_len=512 */
+ { 0x1753ab84 /* x^12255 mod G(x) */, 0xbbf2f6d6 /* x^8159 mod G(x) */, 0x0c30f51d /* x^4063 mod G(x) */, },
+ /* chunk_len=640 */
+ { 0x3796455c /* x^15327 mod G(x) */, 0xb8e0e4a8 /* x^10207 mod G(x) */, 0xc352f6de /* x^5087 mod G(x) */, },
+ /* chunk_len=768 */
+ { 0x3954de39 /* x^18399 mod G(x) */, 0x1753ab84 /* x^12255 mod G(x) */, 0x1d6708a0 /* x^6111 mod G(x) */, },
+ /* chunk_len=896 */
+ { 0x632d78c5 /* x^21471 mod G(x) */, 0x3fc33de4 /* x^14303 mod G(x) */, 0x9a1b53c8 /* x^7135 mod G(x) */, },
+ /* chunk_len=1024 */
+ { 0xa0decef3 /* x^24543 mod G(x) */, 0x7b4aa8b7 /* x^16351 mod G(x) */, 0xbbf2f6d6 /* x^8159 mod G(x) */, },
+ /* chunk_len=1152 */
+ { 0xe9c09bb0 /* x^27615 mod G(x) */, 0x3954de39 /* x^18399 mod G(x) */, 0xdb3839f3 /* x^9183 mod G(x) */, },
+ /* chunk_len=1280 */
+ { 0xd51917a4 /* x^30687 mod G(x) */, 0xcae68461 /* x^20447 mod G(x) */, 0xb8e0e4a8 /* x^10207 mod G(x) */, },
+ /* chunk_len=1408 */
+ { 0x154a8a62 /* x^33759 mod G(x) */, 0x41e7589c /* x^22495 mod G(x) */, 0x3e9a43cd /* x^11231 mod G(x) */, },
+ /* chunk_len=1536 */
+ { 0xf196555d /* x^36831 mod G(x) */, 0xa0decef3 /* x^24543 mod G(x) */, 0x1753ab84 /* x^12255 mod G(x) */, },
+ /* chunk_len=1664 */
+ { 0x8eec2999 /* x^39903 mod G(x) */, 0xefb0a128 /* x^26591 mod G(x) */, 0x6044fbb0 /* x^13279 mod G(x) */, },
+ /* chunk_len=1792 */
+ { 0x27892abf /* x^42975 mod G(x) */, 0x48d72bb1 /* x^28639 mod G(x) */, 0x3fc33de4 /* x^14303 mod G(x) */, },
+ /* chunk_len=1920 */
+ { 0x77bc2419 /* x^46047 mod G(x) */, 0xd51917a4 /* x^30687 mod G(x) */, 0x3796455c /* x^15327 mod G(x) */, },
+ /* chunk_len=2048 */
+ { 0xcea114a5 /* x^49119 mod G(x) */, 0x68c0a2c5 /* x^32735 mod G(x) */, 0x7b4aa8b7 /* x^16351 mod G(x) */, },
+ /* chunk_len=2176 */
+ { 0xa1077e85 /* x^52191 mod G(x) */, 0x188cc628 /* x^34783 mod G(x) */, 0x0c21f835 /* x^17375 mod G(x) */, },
+ /* chunk_len=2304 */
+ { 0xc5ed75e1 /* x^55263 mod G(x) */, 0xf196555d /* x^36831 mod G(x) */, 0x3954de39 /* x^18399 mod G(x) */, },
+ /* chunk_len=2432 */
+ { 0xca4fba3f /* x^58335 mod G(x) */, 0x0acfa26f /* x^38879 mod G(x) */, 0x6cb21510 /* x^19423 mod G(x) */, },
+ /* chunk_len=2560 */
+ { 0xcf5bcdc4 /* x^61407 mod G(x) */, 0x4fae7fc0 /* x^40927 mod G(x) */, 0xcae68461 /* x^20447 mod G(x) */, },
+ /* chunk_len=2688 */
+ { 0xf36b9d16 /* x^64479 mod G(x) */, 0x27892abf /* x^42975 mod G(x) */, 0x632d78c5 /* x^21471 mod G(x) */, },
+ /* chunk_len=2816 */
+ { 0xf76fd988 /* x^67551 mod G(x) */, 0xed5c39b1 /* x^45023 mod G(x) */, 0x41e7589c /* x^22495 mod G(x) */, },
+ /* chunk_len=2944 */
+ { 0x6c45d92e /* x^70623 mod G(x) */, 0xff809fcd /* x^47071 mod G(x) */, 0x0c46baec /* x^23519 mod G(x) */, },
+ /* chunk_len=3072 */
+ { 0x6116b82b /* x^73695 mod G(x) */, 0xcea114a5 /* x^49119 mod G(x) */, 0xa0decef3 /* x^24543 mod G(x) */, },
+ /* chunk_len=3200 */
+ { 0x4d9899bb /* x^76767 mod G(x) */, 0x9f9d8d9c /* x^51167 mod G(x) */, 0x53deb236 /* x^25567 mod G(x) */, },
+ /* chunk_len=3328 */
+ { 0x3e7c93b9 /* x^79839 mod G(x) */, 0x6666b805 /* x^53215 mod G(x) */, 0xefb0a128 /* x^26591 mod G(x) */, },
+ /* chunk_len=3456 */
+ { 0x388b20ac /* x^82911 mod G(x) */, 0xc5ed75e1 /* x^55263 mod G(x) */, 0xe9c09bb0 /* x^27615 mod G(x) */, },
+ /* chunk_len=3584 */
+ { 0x0956d953 /* x^85983 mod G(x) */, 0x97fbdb14 /* x^57311 mod G(x) */, 0x48d72bb1 /* x^28639 mod G(x) */, },
+ /* chunk_len=3712 */
+ { 0x55cb4dfe /* x^89055 mod G(x) */, 0x1b37c832 /* x^59359 mod G(x) */, 0xc07331b3 /* x^29663 mod G(x) */, },
+ /* chunk_len=3840 */
+ { 0x52222fea /* x^92127 mod G(x) */, 0xcf5bcdc4 /* x^61407 mod G(x) */, 0xd51917a4 /* x^30687 mod G(x) */, },
+ /* chunk_len=3968 */
+ { 0x0603989b /* x^95199 mod G(x) */, 0xb03c8112 /* x^63455 mod G(x) */, 0x5e04b9a5 /* x^31711 mod G(x) */, },
+ /* chunk_len=4096 */
+ { 0x4470c029 /* x^98271 mod G(x) */, 0x2339d155 /* x^65503 mod G(x) */, 0x68c0a2c5 /* x^32735 mod G(x) */, },
+ /* chunk_len=4224 */
+ { 0xb6f35093 /* x^101343 mod G(x) */, 0xf76fd988 /* x^67551 mod G(x) */, 0x154a8a62 /* x^33759 mod G(x) */, },
+ /* chunk_len=4352 */
+ { 0xc46805ba /* x^104415 mod G(x) */, 0x416f9449 /* x^69599 mod G(x) */, 0x188cc628 /* x^34783 mod G(x) */, },
+ /* chunk_len=4480 */
+ { 0xc3876592 /* x^107487 mod G(x) */, 0x4b809189 /* x^71647 mod G(x) */, 0xc35cf6e7 /* x^35807 mod G(x) */, },
+ /* chunk_len=4608 */
+ { 0x5b0c98b9 /* x^110559 mod G(x) */, 0x6116b82b /* x^73695 mod G(x) */, 0xf196555d /* x^36831 mod G(x) */, },
+ /* chunk_len=4736 */
+ { 0x30d13e5f /* x^113631 mod G(x) */, 0x4c5a315a /* x^75743 mod G(x) */, 0x8c224466 /* x^37855 mod G(x) */, },
+ /* chunk_len=4864 */
+ { 0x54afca53 /* x^116703 mod G(x) */, 0xbccfa2c1 /* x^77791 mod G(x) */, 0x0acfa26f /* x^38879 mod G(x) */, },
+ /* chunk_len=4992 */
+ { 0x93102436 /* x^119775 mod G(x) */, 0x3e7c93b9 /* x^79839 mod G(x) */, 0x8eec2999 /* x^39903 mod G(x) */, },
+ /* chunk_len=5120 */
+ { 0xbd2655a8 /* x^122847 mod G(x) */, 0x3e116c9d /* x^81887 mod G(x) */, 0x4fae7fc0 /* x^40927 mod G(x) */, },
+ /* chunk_len=5248 */
+ { 0x70cd7f26 /* x^125919 mod G(x) */, 0x408e57f2 /* x^83935 mod G(x) */, 0x1691be45 /* x^41951 mod G(x) */, },
+ /* chunk_len=5376 */
+ { 0x2d546c53 /* x^128991 mod G(x) */, 0x0956d953 /* x^85983 mod G(x) */, 0x27892abf /* x^42975 mod G(x) */, },
+ /* chunk_len=5504 */
+ { 0xb53410a8 /* x^132063 mod G(x) */, 0x42ebf0ad /* x^88031 mod G(x) */, 0x161f3c12 /* x^43999 mod G(x) */, },
+ /* chunk_len=5632 */
+ { 0x67a93f75 /* x^135135 mod G(x) */, 0xcf3233e4 /* x^90079 mod G(x) */, 0xed5c39b1 /* x^45023 mod G(x) */, },
+ /* chunk_len=5760 */
+ { 0x9830ac33 /* x^138207 mod G(x) */, 0x52222fea /* x^92127 mod G(x) */, 0x77bc2419 /* x^46047 mod G(x) */, },
+ /* chunk_len=5888 */
+ { 0xb0b6fc3e /* x^141279 mod G(x) */, 0x2fde73f8 /* x^94175 mod G(x) */, 0xff809fcd /* x^47071 mod G(x) */, },
+ /* chunk_len=6016 */
+ { 0x84170f16 /* x^144351 mod G(x) */, 0xced90d99 /* x^96223 mod G(x) */, 0x30de0f98 /* x^48095 mod G(x) */, },
+ /* chunk_len=6144 */
+ { 0xd7017a0c /* x^147423 mod G(x) */, 0x4470c029 /* x^98271 mod G(x) */, 0xcea114a5 /* x^49119 mod G(x) */, },
+ /* chunk_len=6272 */
+ { 0xadb25de6 /* x^150495 mod G(x) */, 0x84f40beb /* x^100319 mod G(x) */, 0x2b7e0e1b /* x^50143 mod G(x) */, },
+ /* chunk_len=6400 */
+ { 0x8282fddc /* x^153567 mod G(x) */, 0xec855937 /* x^102367 mod G(x) */, 0x9f9d8d9c /* x^51167 mod G(x) */, },
+ /* chunk_len=6528 */
+ { 0x46362bee /* x^156639 mod G(x) */, 0xc46805ba /* x^104415 mod G(x) */, 0xa1077e85 /* x^52191 mod G(x) */, },
+ /* chunk_len=6656 */
+ { 0xb9077a01 /* x^159711 mod G(x) */, 0xdf7a24ac /* x^106463 mod G(x) */, 0x6666b805 /* x^53215 mod G(x) */, },
+ /* chunk_len=6784 */
+ { 0xf51d9bc6 /* x^162783 mod G(x) */, 0x2b52dc39 /* x^108511 mod G(x) */, 0x7e774cf6 /* x^54239 mod G(x) */, },
+ /* chunk_len=6912 */
+ { 0x4ca19a29 /* x^165855 mod G(x) */, 0x5b0c98b9 /* x^110559 mod G(x) */, 0xc5ed75e1 /* x^55263 mod G(x) */, },
+ /* chunk_len=7040 */
+ { 0xdc0fc3fc /* x^168927 mod G(x) */, 0xb939fcdf /* x^112607 mod G(x) */, 0x3678fed2 /* x^56287 mod G(x) */, },
+ /* chunk_len=7168 */
+ { 0x63c3d167 /* x^171999 mod G(x) */, 0x70f9947d /* x^114655 mod G(x) */, 0x97fbdb14 /* x^57311 mod G(x) */, },
+ /* chunk_len=7296 */
+ { 0x5851d254 /* x^175071 mod G(x) */, 0x54afca53 /* x^116703 mod G(x) */, 0xca4fba3f /* x^58335 mod G(x) */, },
+ /* chunk_len=7424 */
+ { 0xfeacf2a1 /* x^178143 mod G(x) */, 0x7a3c0a6a /* x^118751 mod G(x) */, 0x1b37c832 /* x^59359 mod G(x) */, },
+ /* chunk_len=7552 */
+ { 0x93b7edc8 /* x^181215 mod G(x) */, 0x1fea4d2a /* x^120799 mod G(x) */, 0x58fa96ee /* x^60383 mod G(x) */, },
+ /* chunk_len=7680 */
+ { 0x5539e44a /* x^184287 mod G(x) */, 0xbd2655a8 /* x^122847 mod G(x) */, 0xcf5bcdc4 /* x^61407 mod G(x) */, },
+ /* chunk_len=7808 */
+ { 0xde32a3d2 /* x^187359 mod G(x) */, 0x4ff61aa1 /* x^124895 mod G(x) */, 0x6a6a3694 /* x^62431 mod G(x) */, },
+ /* chunk_len=7936 */
+ { 0xf0baeeb6 /* x^190431 mod G(x) */, 0x7ae2f6f4 /* x^126943 mod G(x) */, 0xb03c8112 /* x^63455 mod G(x) */, },
+ /* chunk_len=8064 */
+ { 0xbe15887f /* x^193503 mod G(x) */, 0x2d546c53 /* x^128991 mod G(x) */, 0xf36b9d16 /* x^64479 mod G(x) */, },
+ /* chunk_len=8192 */
+ { 0x64f34a05 /* x^196575 mod G(x) */, 0xe0ee5efe /* x^131039 mod G(x) */, 0x2339d155 /* x^65503 mod G(x) */, },
+ /* chunk_len=8320 */
+ { 0x1b6d1aea /* x^199647 mod G(x) */, 0xfeafb67c /* x^133087 mod G(x) */, 0x4fb001a8 /* x^66527 mod G(x) */, },
+ /* chunk_len=8448 */
+ { 0x82adb0b8 /* x^202719 mod G(x) */, 0x67a93f75 /* x^135135 mod G(x) */, 0xf76fd988 /* x^67551 mod G(x) */, },
+ /* chunk_len=8576 */
+ { 0x694587c7 /* x^205791 mod G(x) */, 0x3b34408b /* x^137183 mod G(x) */, 0xeccb2978 /* x^68575 mod G(x) */, },
+ /* chunk_len=8704 */
+ { 0xd2fc57c3 /* x^208863 mod G(x) */, 0x07fcf8c6 /* x^139231 mod G(x) */, 0x416f9449 /* x^69599 mod G(x) */, },
+ /* chunk_len=8832 */
+ { 0x9dd6837c /* x^211935 mod G(x) */, 0xb0b6fc3e /* x^141279 mod G(x) */, 0x6c45d92e /* x^70623 mod G(x) */, },
+ /* chunk_len=8960 */
+ { 0x3a9d1f97 /* x^215007 mod G(x) */, 0xefd033b2 /* x^143327 mod G(x) */, 0x4b809189 /* x^71647 mod G(x) */, },
+ /* chunk_len=9088 */
+ { 0x1eee1d2a /* x^218079 mod G(x) */, 0xf2a6e46e /* x^145375 mod G(x) */, 0x55b4c814 /* x^72671 mod G(x) */, },
+ /* chunk_len=9216 */
+ { 0xb57c7728 /* x^221151 mod G(x) */, 0xd7017a0c /* x^147423 mod G(x) */, 0x6116b82b /* x^73695 mod G(x) */, },
+ /* chunk_len=9344 */
+ { 0xf2fc5d61 /* x^224223 mod G(x) */, 0x242aac86 /* x^149471 mod G(x) */, 0x05245cf0 /* x^74719 mod G(x) */, },
+ /* chunk_len=9472 */
+ { 0x26387824 /* x^227295 mod G(x) */, 0xc15c4ca5 /* x^151519 mod G(x) */, 0x4c5a315a /* x^75743 mod G(x) */, },
+ /* chunk_len=9600 */
+ { 0x8c151e77 /* x^230367 mod G(x) */, 0x8282fddc /* x^153567 mod G(x) */, 0x4d9899bb /* x^76767 mod G(x) */, },
+ /* chunk_len=9728 */
+ { 0x8ea1f680 /* x^233439 mod G(x) */, 0xf5ff6cdd /* x^155615 mod G(x) */, 0xbccfa2c1 /* x^77791 mod G(x) */, },
+ /* chunk_len=9856 */
+ { 0xe8cf3d2a /* x^236511 mod G(x) */, 0x338b1fb1 /* x^157663 mod G(x) */, 0xeda61f70 /* x^78815 mod G(x) */, },
+ /* chunk_len=9984 */
+ { 0x21f15b59 /* x^239583 mod G(x) */, 0xb9077a01 /* x^159711 mod G(x) */, 0x3e7c93b9 /* x^79839 mod G(x) */, },
+ /* chunk_len=10112 */
+ { 0x6f68d64a /* x^242655 mod G(x) */, 0x901b0161 /* x^161759 mod G(x) */, 0xb9fd3537 /* x^80863 mod G(x) */, },
+ /* chunk_len=10240 */
+ { 0x71b74d95 /* x^245727 mod G(x) */, 0xf5ddd5ad /* x^163807 mod G(x) */, 0x3e116c9d /* x^81887 mod G(x) */, },
+ /* chunk_len=10368 */
+ { 0x4c2e7261 /* x^248799 mod G(x) */, 0x4ca19a29 /* x^165855 mod G(x) */, 0x388b20ac /* x^82911 mod G(x) */, },
+ /* chunk_len=10496 */
+ { 0x8a2d38e8 /* x^251871 mod G(x) */, 0xd27ee0a1 /* x^167903 mod G(x) */, 0x408e57f2 /* x^83935 mod G(x) */, },
+ /* chunk_len=10624 */
+ { 0x7e58ca17 /* x^254943 mod G(x) */, 0x69dfedd2 /* x^169951 mod G(x) */, 0x3a76805e /* x^84959 mod G(x) */, },
+ /* chunk_len=10752 */
+ { 0xf997967f /* x^258015 mod G(x) */, 0x63c3d167 /* x^171999 mod G(x) */, 0x0956d953 /* x^85983 mod G(x) */, },
+ /* chunk_len=10880 */
+ { 0x48215963 /* x^261087 mod G(x) */, 0x71e1dfe0 /* x^174047 mod G(x) */, 0x42a6d410 /* x^87007 mod G(x) */, },
+ /* chunk_len=11008 */
+ { 0xa704b94c /* x^264159 mod G(x) */, 0x679f198a /* x^176095 mod G(x) */, 0x42ebf0ad /* x^88031 mod G(x) */, },
+ /* chunk_len=11136 */
+ { 0x1d699056 /* x^267231 mod G(x) */, 0xfeacf2a1 /* x^178143 mod G(x) */, 0x55cb4dfe /* x^89055 mod G(x) */, },
+ /* chunk_len=11264 */
+ { 0x6800bcc5 /* x^270303 mod G(x) */, 0x16024f15 /* x^180191 mod G(x) */, 0xcf3233e4 /* x^90079 mod G(x) */, },
+ /* chunk_len=11392 */
+ { 0x2d48e4ca /* x^273375 mod G(x) */, 0xbe61582f /* x^182239 mod G(x) */, 0x46026283 /* x^91103 mod G(x) */, },
+ /* chunk_len=11520 */
+ { 0x4c4c2b55 /* x^276447 mod G(x) */, 0x5539e44a /* x^184287 mod G(x) */, 0x52222fea /* x^92127 mod G(x) */, },
+ /* chunk_len=11648 */
+ { 0xd8ce94cb /* x^279519 mod G(x) */, 0xbc613c26 /* x^186335 mod G(x) */, 0x33776b4b /* x^93151 mod G(x) */, },
+ /* chunk_len=11776 */
+ { 0xd0b5a02b /* x^282591 mod G(x) */, 0x490d3cc6 /* x^188383 mod G(x) */, 0x2fde73f8 /* x^94175 mod G(x) */, },
+ /* chunk_len=11904 */
+ { 0xa223f7ec /* x^285663 mod G(x) */, 0xf0baeeb6 /* x^190431 mod G(x) */, 0x0603989b /* x^95199 mod G(x) */, },
+ /* chunk_len=12032 */
+ { 0x58de337a /* x^288735 mod G(x) */, 0x3bf3d597 /* x^192479 mod G(x) */, 0xced90d99 /* x^96223 mod G(x) */, },
+ /* chunk_len=12160 */
+ { 0x37f5d8f4 /* x^291807 mod G(x) */, 0x4d5b699b /* x^194527 mod G(x) */, 0xd7262e5f /* x^97247 mod G(x) */, },
+ /* chunk_len=12288 */
+ { 0xfa8a435d /* x^294879 mod G(x) */, 0x64f34a05 /* x^196575 mod G(x) */, 0x4470c029 /* x^98271 mod G(x) */, },
+ /* chunk_len=12416 */
+ { 0x238709fe /* x^297951 mod G(x) */, 0x52e7458f /* x^198623 mod G(x) */, 0x9a174cd3 /* x^99295 mod G(x) */, },
+ /* chunk_len=12544 */
+ { 0x9e1ba6f5 /* x^301023 mod G(x) */, 0xef0272f7 /* x^200671 mod G(x) */, 0x84f40beb /* x^100319 mod G(x) */, },
+ /* chunk_len=12672 */
+ { 0xcd8b57fa /* x^304095 mod G(x) */, 0x82adb0b8 /* x^202719 mod G(x) */, 0xb6f35093 /* x^101343 mod G(x) */, },
+ /* chunk_len=12800 */
+ { 0x0aed142f /* x^307167 mod G(x) */, 0xb1650290 /* x^204767 mod G(x) */, 0xec855937 /* x^102367 mod G(x) */, },
+ /* chunk_len=12928 */
+ { 0xd1f064db /* x^310239 mod G(x) */, 0x6e7340d3 /* x^206815 mod G(x) */, 0x5c28cb52 /* x^103391 mod G(x) */, },
+ /* chunk_len=13056 */
+ { 0x464ac895 /* x^313311 mod G(x) */, 0xd2fc57c3 /* x^208863 mod G(x) */, 0xc46805ba /* x^104415 mod G(x) */, },
+ /* chunk_len=13184 */
+ { 0xa0e6beea /* x^316383 mod G(x) */, 0xcfeec3d0 /* x^210911 mod G(x) */, 0x0225d214 /* x^105439 mod G(x) */, },
+ /* chunk_len=13312 */
+ { 0x78703ce0 /* x^319455 mod G(x) */, 0xc60f6075 /* x^212959 mod G(x) */, 0xdf7a24ac /* x^106463 mod G(x) */, },
+ /* chunk_len=13440 */
+ { 0xfea48165 /* x^322527 mod G(x) */, 0x3a9d1f97 /* x^215007 mod G(x) */, 0xc3876592 /* x^107487 mod G(x) */, },
+ /* chunk_len=13568 */
+ { 0xdb89b8db /* x^325599 mod G(x) */, 0xa6172211 /* x^217055 mod G(x) */, 0x2b52dc39 /* x^108511 mod G(x) */, },
+ /* chunk_len=13696 */
+ { 0x7ca03731 /* x^328671 mod G(x) */, 0x1db42849 /* x^219103 mod G(x) */, 0xc5df246e /* x^109535 mod G(x) */, },
+ /* chunk_len=13824 */
+ { 0x8801d0aa /* x^331743 mod G(x) */, 0xb57c7728 /* x^221151 mod G(x) */, 0x5b0c98b9 /* x^110559 mod G(x) */, },
+ /* chunk_len=13952 */
+ { 0xf89cd7f0 /* x^334815 mod G(x) */, 0xcc396a0b /* x^223199 mod G(x) */, 0xdb799c51 /* x^111583 mod G(x) */, },
+ /* chunk_len=14080 */
+ { 0x1611a808 /* x^337887 mod G(x) */, 0xaeae6105 /* x^225247 mod G(x) */, 0xb939fcdf /* x^112607 mod G(x) */, },
+ /* chunk_len=14208 */
+ { 0xe3cdb888 /* x^340959 mod G(x) */, 0x26387824 /* x^227295 mod G(x) */, 0x30d13e5f /* x^113631 mod G(x) */, },
+ /* chunk_len=14336 */
+ { 0x552a4cf6 /* x^344031 mod G(x) */, 0xee2d04bb /* x^229343 mod G(x) */, 0x70f9947d /* x^114655 mod G(x) */, },
+ /* chunk_len=14464 */
+ { 0x85e248e9 /* x^347103 mod G(x) */, 0x0a79663f /* x^231391 mod G(x) */, 0x53339cf7 /* x^115679 mod G(x) */, },
+ /* chunk_len=14592 */
+ { 0x1c61c3e9 /* x^350175 mod G(x) */, 0x8ea1f680 /* x^233439 mod G(x) */, 0x54afca53 /* x^116703 mod G(x) */, },
+ /* chunk_len=14720 */
+ { 0xb14cfc2b /* x^353247 mod G(x) */, 0x2e073302 /* x^235487 mod G(x) */, 0x10897992 /* x^117727 mod G(x) */, },
+ /* chunk_len=14848 */
+ { 0x6ec444cc /* x^356319 mod G(x) */, 0x9e819f13 /* x^237535 mod G(x) */, 0x7a3c0a6a /* x^118751 mod G(x) */, },
+ /* chunk_len=14976 */
+ { 0xe2fa5f80 /* x^359391 mod G(x) */, 0x21f15b59 /* x^239583 mod G(x) */, 0x93102436 /* x^119775 mod G(x) */, },
+ /* chunk_len=15104 */
+ { 0x6d33f4c6 /* x^362463 mod G(x) */, 0x31a27455 /* x^241631 mod G(x) */, 0x1fea4d2a /* x^120799 mod G(x) */, },
+ /* chunk_len=15232 */
+ { 0xb6dec609 /* x^365535 mod G(x) */, 0x4d437056 /* x^243679 mod G(x) */, 0x42eb1e2a /* x^121823 mod G(x) */, },
+ /* chunk_len=15360 */
+ { 0x1846c518 /* x^368607 mod G(x) */, 0x71b74d95 /* x^245727 mod G(x) */, 0xbd2655a8 /* x^122847 mod G(x) */, },
+ /* chunk_len=15488 */
+ { 0x9f947f8a /* x^371679 mod G(x) */, 0x2b501619 /* x^247775 mod G(x) */, 0xa4924b0e /* x^123871 mod G(x) */, },
+ /* chunk_len=15616 */
+ { 0xb7442f4d /* x^374751 mod G(x) */, 0xba30a5d8 /* x^249823 mod G(x) */, 0x4ff61aa1 /* x^124895 mod G(x) */, },
+ /* chunk_len=15744 */
+ { 0xe2c93242 /* x^377823 mod G(x) */, 0x8a2d38e8 /* x^251871 mod G(x) */, 0x70cd7f26 /* x^125919 mod G(x) */, },
+ /* chunk_len=15872 */
+ { 0xcd6863df /* x^380895 mod G(x) */, 0x78fd88dc /* x^253919 mod G(x) */, 0x7ae2f6f4 /* x^126943 mod G(x) */, },
+ /* chunk_len=16000 */
+ { 0xd512001d /* x^383967 mod G(x) */, 0xe6612dff /* x^255967 mod G(x) */, 0x5c4d0ca9 /* x^127967 mod G(x) */, },
+ /* chunk_len=16128 */
+ { 0x4e8d6b6c /* x^387039 mod G(x) */, 0xf997967f /* x^258015 mod G(x) */, 0x2d546c53 /* x^128991 mod G(x) */, },
+ /* chunk_len=16256 */
+ { 0xfa653ba1 /* x^390111 mod G(x) */, 0xc99014d4 /* x^260063 mod G(x) */, 0xa0c9fd27 /* x^130015 mod G(x) */, },
+ /* chunk_len=16384 */
+ { 0x49893408 /* x^393183 mod G(x) */, 0x29c2448b /* x^262111 mod G(x) */, 0xe0ee5efe /* x^131039 mod G(x) */, },
+};
+
+/* Multipliers for implementations that use a large fixed chunk length */
+#define CRC32_FIXED_CHUNK_LEN 32768UL
+#define CRC32_FIXED_CHUNK_MULT_1 0x29c2448b /* x^262111 mod G(x) */
+#define CRC32_FIXED_CHUNK_MULT_2 0x4b912f53 /* x^524255 mod G(x) */
+#define CRC32_FIXED_CHUNK_MULT_3 0x454c93be /* x^786399 mod G(x) */
diff --git a/tools/z64compress/src/enc/libdeflate/lib/crc32_tables.h b/tools/z64compress/src/enc/libdeflate/lib/crc32_tables.h
new file mode 100644
index 000000000..86228c72a
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/lib/crc32_tables.h
@@ -0,0 +1,587 @@
+/*
+ * crc32_tables.h - data tables for CRC-32 computation
+ *
+ * THIS FILE WAS GENERATED BY gen_crc32_tables.c. DO NOT EDIT.
+ */
+
+static const u32 crc32_slice1_table[] MAYBE_UNUSED = {	/* 256-entry lookup table, indexed by one byte in crc32_slice1() (generated -- do not edit) */
+	0x00000000, 0x77073096, 0xee0e612c, 0x990951ba,
+	0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3,
+	0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988,
+	0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91,
+	0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de,
+	0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
+	0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec,
+	0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5,
+	0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172,
+	0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b,
+	0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940,
+	0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
+	0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116,
+	0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f,
+	0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
+	0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d,
+	0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a,
+	0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
+	0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818,
+	0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01,
+	0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
+	0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457,
+	0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c,
+	0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
+	0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2,
+	0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb,
+	0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0,
+	0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9,
+	0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086,
+	0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
+	0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4,
+	0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad,
+	0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a,
+	0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683,
+	0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8,
+	0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
+	0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe,
+	0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7,
+	0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc,
+	0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5,
+	0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252,
+	0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
+	0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60,
+	0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79,
+	0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
+	0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f,
+	0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04,
+	0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
+	0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a,
+	0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713,
+	0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38,
+	0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21,
+	0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e,
+	0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
+	0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c,
+	0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45,
+	0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
+	0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db,
+	0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0,
+	0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
+	0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6,
+	0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf,
+	0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
+	0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d,
+};
+
+static const u32 crc32_slice8_table[] MAYBE_UNUSED = {
+ 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba,
+ 0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3,
+ 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988,
+ 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91,
+ 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de,
+ 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
+ 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec,
+ 0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5,
+ 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172,
+ 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b,
+ 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940,
+ 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
+ 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116,
+ 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f,
+ 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
+ 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d,
+ 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a,
+ 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
+ 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818,
+ 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01,
+ 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
+ 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457,
+ 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c,
+ 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
+ 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2,
+ 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb,
+ 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0,
+ 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9,
+ 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086,
+ 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
+ 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4,
+ 0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad,
+ 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a,
+ 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683,
+ 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8,
+ 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
+ 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe,
+ 0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7,
+ 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc,
+ 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5,
+ 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252,
+ 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
+ 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60,
+ 0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79,
+ 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
+ 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f,
+ 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04,
+ 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
+ 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a,
+ 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713,
+ 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38,
+ 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21,
+ 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e,
+ 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
+ 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c,
+ 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45,
+ 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
+ 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db,
+ 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0,
+ 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
+ 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6,
+ 0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf,
+ 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
+ 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d,
+ 0x00000000, 0x191b3141, 0x32366282, 0x2b2d53c3,
+ 0x646cc504, 0x7d77f445, 0x565aa786, 0x4f4196c7,
+ 0xc8d98a08, 0xd1c2bb49, 0xfaefe88a, 0xe3f4d9cb,
+ 0xacb54f0c, 0xb5ae7e4d, 0x9e832d8e, 0x87981ccf,
+ 0x4ac21251, 0x53d92310, 0x78f470d3, 0x61ef4192,
+ 0x2eaed755, 0x37b5e614, 0x1c98b5d7, 0x05838496,
+ 0x821b9859, 0x9b00a918, 0xb02dfadb, 0xa936cb9a,
+ 0xe6775d5d, 0xff6c6c1c, 0xd4413fdf, 0xcd5a0e9e,
+ 0x958424a2, 0x8c9f15e3, 0xa7b24620, 0xbea97761,
+ 0xf1e8e1a6, 0xe8f3d0e7, 0xc3de8324, 0xdac5b265,
+ 0x5d5daeaa, 0x44469feb, 0x6f6bcc28, 0x7670fd69,
+ 0x39316bae, 0x202a5aef, 0x0b07092c, 0x121c386d,
+ 0xdf4636f3, 0xc65d07b2, 0xed705471, 0xf46b6530,
+ 0xbb2af3f7, 0xa231c2b6, 0x891c9175, 0x9007a034,
+ 0x179fbcfb, 0x0e848dba, 0x25a9de79, 0x3cb2ef38,
+ 0x73f379ff, 0x6ae848be, 0x41c51b7d, 0x58de2a3c,
+ 0xf0794f05, 0xe9627e44, 0xc24f2d87, 0xdb541cc6,
+ 0x94158a01, 0x8d0ebb40, 0xa623e883, 0xbf38d9c2,
+ 0x38a0c50d, 0x21bbf44c, 0x0a96a78f, 0x138d96ce,
+ 0x5ccc0009, 0x45d73148, 0x6efa628b, 0x77e153ca,
+ 0xbabb5d54, 0xa3a06c15, 0x888d3fd6, 0x91960e97,
+ 0xded79850, 0xc7cca911, 0xece1fad2, 0xf5facb93,
+ 0x7262d75c, 0x6b79e61d, 0x4054b5de, 0x594f849f,
+ 0x160e1258, 0x0f152319, 0x243870da, 0x3d23419b,
+ 0x65fd6ba7, 0x7ce65ae6, 0x57cb0925, 0x4ed03864,
+ 0x0191aea3, 0x188a9fe2, 0x33a7cc21, 0x2abcfd60,
+ 0xad24e1af, 0xb43fd0ee, 0x9f12832d, 0x8609b26c,
+ 0xc94824ab, 0xd05315ea, 0xfb7e4629, 0xe2657768,
+ 0x2f3f79f6, 0x362448b7, 0x1d091b74, 0x04122a35,
+ 0x4b53bcf2, 0x52488db3, 0x7965de70, 0x607eef31,
+ 0xe7e6f3fe, 0xfefdc2bf, 0xd5d0917c, 0xcccba03d,
+ 0x838a36fa, 0x9a9107bb, 0xb1bc5478, 0xa8a76539,
+ 0x3b83984b, 0x2298a90a, 0x09b5fac9, 0x10aecb88,
+ 0x5fef5d4f, 0x46f46c0e, 0x6dd93fcd, 0x74c20e8c,
+ 0xf35a1243, 0xea412302, 0xc16c70c1, 0xd8774180,
+ 0x9736d747, 0x8e2de606, 0xa500b5c5, 0xbc1b8484,
+ 0x71418a1a, 0x685abb5b, 0x4377e898, 0x5a6cd9d9,
+ 0x152d4f1e, 0x0c367e5f, 0x271b2d9c, 0x3e001cdd,
+ 0xb9980012, 0xa0833153, 0x8bae6290, 0x92b553d1,
+ 0xddf4c516, 0xc4eff457, 0xefc2a794, 0xf6d996d5,
+ 0xae07bce9, 0xb71c8da8, 0x9c31de6b, 0x852aef2a,
+ 0xca6b79ed, 0xd37048ac, 0xf85d1b6f, 0xe1462a2e,
+ 0x66de36e1, 0x7fc507a0, 0x54e85463, 0x4df36522,
+ 0x02b2f3e5, 0x1ba9c2a4, 0x30849167, 0x299fa026,
+ 0xe4c5aeb8, 0xfdde9ff9, 0xd6f3cc3a, 0xcfe8fd7b,
+ 0x80a96bbc, 0x99b25afd, 0xb29f093e, 0xab84387f,
+ 0x2c1c24b0, 0x350715f1, 0x1e2a4632, 0x07317773,
+ 0x4870e1b4, 0x516bd0f5, 0x7a468336, 0x635db277,
+ 0xcbfad74e, 0xd2e1e60f, 0xf9ccb5cc, 0xe0d7848d,
+ 0xaf96124a, 0xb68d230b, 0x9da070c8, 0x84bb4189,
+ 0x03235d46, 0x1a386c07, 0x31153fc4, 0x280e0e85,
+ 0x674f9842, 0x7e54a903, 0x5579fac0, 0x4c62cb81,
+ 0x8138c51f, 0x9823f45e, 0xb30ea79d, 0xaa1596dc,
+ 0xe554001b, 0xfc4f315a, 0xd7626299, 0xce7953d8,
+ 0x49e14f17, 0x50fa7e56, 0x7bd72d95, 0x62cc1cd4,
+ 0x2d8d8a13, 0x3496bb52, 0x1fbbe891, 0x06a0d9d0,
+ 0x5e7ef3ec, 0x4765c2ad, 0x6c48916e, 0x7553a02f,
+ 0x3a1236e8, 0x230907a9, 0x0824546a, 0x113f652b,
+ 0x96a779e4, 0x8fbc48a5, 0xa4911b66, 0xbd8a2a27,
+ 0xf2cbbce0, 0xebd08da1, 0xc0fdde62, 0xd9e6ef23,
+ 0x14bce1bd, 0x0da7d0fc, 0x268a833f, 0x3f91b27e,
+ 0x70d024b9, 0x69cb15f8, 0x42e6463b, 0x5bfd777a,
+ 0xdc656bb5, 0xc57e5af4, 0xee530937, 0xf7483876,
+ 0xb809aeb1, 0xa1129ff0, 0x8a3fcc33, 0x9324fd72,
+ 0x00000000, 0x01c26a37, 0x0384d46e, 0x0246be59,
+ 0x0709a8dc, 0x06cbc2eb, 0x048d7cb2, 0x054f1685,
+ 0x0e1351b8, 0x0fd13b8f, 0x0d9785d6, 0x0c55efe1,
+ 0x091af964, 0x08d89353, 0x0a9e2d0a, 0x0b5c473d,
+ 0x1c26a370, 0x1de4c947, 0x1fa2771e, 0x1e601d29,
+ 0x1b2f0bac, 0x1aed619b, 0x18abdfc2, 0x1969b5f5,
+ 0x1235f2c8, 0x13f798ff, 0x11b126a6, 0x10734c91,
+ 0x153c5a14, 0x14fe3023, 0x16b88e7a, 0x177ae44d,
+ 0x384d46e0, 0x398f2cd7, 0x3bc9928e, 0x3a0bf8b9,
+ 0x3f44ee3c, 0x3e86840b, 0x3cc03a52, 0x3d025065,
+ 0x365e1758, 0x379c7d6f, 0x35dac336, 0x3418a901,
+ 0x3157bf84, 0x3095d5b3, 0x32d36bea, 0x331101dd,
+ 0x246be590, 0x25a98fa7, 0x27ef31fe, 0x262d5bc9,
+ 0x23624d4c, 0x22a0277b, 0x20e69922, 0x2124f315,
+ 0x2a78b428, 0x2bbade1f, 0x29fc6046, 0x283e0a71,
+ 0x2d711cf4, 0x2cb376c3, 0x2ef5c89a, 0x2f37a2ad,
+ 0x709a8dc0, 0x7158e7f7, 0x731e59ae, 0x72dc3399,
+ 0x7793251c, 0x76514f2b, 0x7417f172, 0x75d59b45,
+ 0x7e89dc78, 0x7f4bb64f, 0x7d0d0816, 0x7ccf6221,
+ 0x798074a4, 0x78421e93, 0x7a04a0ca, 0x7bc6cafd,
+ 0x6cbc2eb0, 0x6d7e4487, 0x6f38fade, 0x6efa90e9,
+ 0x6bb5866c, 0x6a77ec5b, 0x68315202, 0x69f33835,
+ 0x62af7f08, 0x636d153f, 0x612bab66, 0x60e9c151,
+ 0x65a6d7d4, 0x6464bde3, 0x662203ba, 0x67e0698d,
+ 0x48d7cb20, 0x4915a117, 0x4b531f4e, 0x4a917579,
+ 0x4fde63fc, 0x4e1c09cb, 0x4c5ab792, 0x4d98dda5,
+ 0x46c49a98, 0x4706f0af, 0x45404ef6, 0x448224c1,
+ 0x41cd3244, 0x400f5873, 0x4249e62a, 0x438b8c1d,
+ 0x54f16850, 0x55330267, 0x5775bc3e, 0x56b7d609,
+ 0x53f8c08c, 0x523aaabb, 0x507c14e2, 0x51be7ed5,
+ 0x5ae239e8, 0x5b2053df, 0x5966ed86, 0x58a487b1,
+ 0x5deb9134, 0x5c29fb03, 0x5e6f455a, 0x5fad2f6d,
+ 0xe1351b80, 0xe0f771b7, 0xe2b1cfee, 0xe373a5d9,
+ 0xe63cb35c, 0xe7fed96b, 0xe5b86732, 0xe47a0d05,
+ 0xef264a38, 0xeee4200f, 0xeca29e56, 0xed60f461,
+ 0xe82fe2e4, 0xe9ed88d3, 0xebab368a, 0xea695cbd,
+ 0xfd13b8f0, 0xfcd1d2c7, 0xfe976c9e, 0xff5506a9,
+ 0xfa1a102c, 0xfbd87a1b, 0xf99ec442, 0xf85cae75,
+ 0xf300e948, 0xf2c2837f, 0xf0843d26, 0xf1465711,
+ 0xf4094194, 0xf5cb2ba3, 0xf78d95fa, 0xf64fffcd,
+ 0xd9785d60, 0xd8ba3757, 0xdafc890e, 0xdb3ee339,
+ 0xde71f5bc, 0xdfb39f8b, 0xddf521d2, 0xdc374be5,
+ 0xd76b0cd8, 0xd6a966ef, 0xd4efd8b6, 0xd52db281,
+ 0xd062a404, 0xd1a0ce33, 0xd3e6706a, 0xd2241a5d,
+ 0xc55efe10, 0xc49c9427, 0xc6da2a7e, 0xc7184049,
+ 0xc25756cc, 0xc3953cfb, 0xc1d382a2, 0xc011e895,
+ 0xcb4dafa8, 0xca8fc59f, 0xc8c97bc6, 0xc90b11f1,
+ 0xcc440774, 0xcd866d43, 0xcfc0d31a, 0xce02b92d,
+ 0x91af9640, 0x906dfc77, 0x922b422e, 0x93e92819,
+ 0x96a63e9c, 0x976454ab, 0x9522eaf2, 0x94e080c5,
+ 0x9fbcc7f8, 0x9e7eadcf, 0x9c381396, 0x9dfa79a1,
+ 0x98b56f24, 0x99770513, 0x9b31bb4a, 0x9af3d17d,
+ 0x8d893530, 0x8c4b5f07, 0x8e0de15e, 0x8fcf8b69,
+ 0x8a809dec, 0x8b42f7db, 0x89044982, 0x88c623b5,
+ 0x839a6488, 0x82580ebf, 0x801eb0e6, 0x81dcdad1,
+ 0x8493cc54, 0x8551a663, 0x8717183a, 0x86d5720d,
+ 0xa9e2d0a0, 0xa820ba97, 0xaa6604ce, 0xaba46ef9,
+ 0xaeeb787c, 0xaf29124b, 0xad6fac12, 0xacadc625,
+ 0xa7f18118, 0xa633eb2f, 0xa4755576, 0xa5b73f41,
+ 0xa0f829c4, 0xa13a43f3, 0xa37cfdaa, 0xa2be979d,
+ 0xb5c473d0, 0xb40619e7, 0xb640a7be, 0xb782cd89,
+ 0xb2cddb0c, 0xb30fb13b, 0xb1490f62, 0xb08b6555,
+ 0xbbd72268, 0xba15485f, 0xb853f606, 0xb9919c31,
+ 0xbcde8ab4, 0xbd1ce083, 0xbf5a5eda, 0xbe9834ed,
+ 0x00000000, 0xb8bc6765, 0xaa09c88b, 0x12b5afee,
+ 0x8f629757, 0x37def032, 0x256b5fdc, 0x9dd738b9,
+ 0xc5b428ef, 0x7d084f8a, 0x6fbde064, 0xd7018701,
+ 0x4ad6bfb8, 0xf26ad8dd, 0xe0df7733, 0x58631056,
+ 0x5019579f, 0xe8a530fa, 0xfa109f14, 0x42acf871,
+ 0xdf7bc0c8, 0x67c7a7ad, 0x75720843, 0xcdce6f26,
+ 0x95ad7f70, 0x2d111815, 0x3fa4b7fb, 0x8718d09e,
+ 0x1acfe827, 0xa2738f42, 0xb0c620ac, 0x087a47c9,
+ 0xa032af3e, 0x188ec85b, 0x0a3b67b5, 0xb28700d0,
+ 0x2f503869, 0x97ec5f0c, 0x8559f0e2, 0x3de59787,
+ 0x658687d1, 0xdd3ae0b4, 0xcf8f4f5a, 0x7733283f,
+ 0xeae41086, 0x525877e3, 0x40edd80d, 0xf851bf68,
+ 0xf02bf8a1, 0x48979fc4, 0x5a22302a, 0xe29e574f,
+ 0x7f496ff6, 0xc7f50893, 0xd540a77d, 0x6dfcc018,
+ 0x359fd04e, 0x8d23b72b, 0x9f9618c5, 0x272a7fa0,
+ 0xbafd4719, 0x0241207c, 0x10f48f92, 0xa848e8f7,
+ 0x9b14583d, 0x23a83f58, 0x311d90b6, 0x89a1f7d3,
+ 0x1476cf6a, 0xaccaa80f, 0xbe7f07e1, 0x06c36084,
+ 0x5ea070d2, 0xe61c17b7, 0xf4a9b859, 0x4c15df3c,
+ 0xd1c2e785, 0x697e80e0, 0x7bcb2f0e, 0xc377486b,
+ 0xcb0d0fa2, 0x73b168c7, 0x6104c729, 0xd9b8a04c,
+ 0x446f98f5, 0xfcd3ff90, 0xee66507e, 0x56da371b,
+ 0x0eb9274d, 0xb6054028, 0xa4b0efc6, 0x1c0c88a3,
+ 0x81dbb01a, 0x3967d77f, 0x2bd27891, 0x936e1ff4,
+ 0x3b26f703, 0x839a9066, 0x912f3f88, 0x299358ed,
+ 0xb4446054, 0x0cf80731, 0x1e4da8df, 0xa6f1cfba,
+ 0xfe92dfec, 0x462eb889, 0x549b1767, 0xec277002,
+ 0x71f048bb, 0xc94c2fde, 0xdbf98030, 0x6345e755,
+ 0x6b3fa09c, 0xd383c7f9, 0xc1366817, 0x798a0f72,
+ 0xe45d37cb, 0x5ce150ae, 0x4e54ff40, 0xf6e89825,
+ 0xae8b8873, 0x1637ef16, 0x048240f8, 0xbc3e279d,
+ 0x21e91f24, 0x99557841, 0x8be0d7af, 0x335cb0ca,
+ 0xed59b63b, 0x55e5d15e, 0x47507eb0, 0xffec19d5,
+ 0x623b216c, 0xda874609, 0xc832e9e7, 0x708e8e82,
+ 0x28ed9ed4, 0x9051f9b1, 0x82e4565f, 0x3a58313a,
+ 0xa78f0983, 0x1f336ee6, 0x0d86c108, 0xb53aa66d,
+ 0xbd40e1a4, 0x05fc86c1, 0x1749292f, 0xaff54e4a,
+ 0x322276f3, 0x8a9e1196, 0x982bbe78, 0x2097d91d,
+ 0x78f4c94b, 0xc048ae2e, 0xd2fd01c0, 0x6a4166a5,
+ 0xf7965e1c, 0x4f2a3979, 0x5d9f9697, 0xe523f1f2,
+ 0x4d6b1905, 0xf5d77e60, 0xe762d18e, 0x5fdeb6eb,
+ 0xc2098e52, 0x7ab5e937, 0x680046d9, 0xd0bc21bc,
+ 0x88df31ea, 0x3063568f, 0x22d6f961, 0x9a6a9e04,
+ 0x07bda6bd, 0xbf01c1d8, 0xadb46e36, 0x15080953,
+ 0x1d724e9a, 0xa5ce29ff, 0xb77b8611, 0x0fc7e174,
+ 0x9210d9cd, 0x2aacbea8, 0x38191146, 0x80a57623,
+ 0xd8c66675, 0x607a0110, 0x72cfaefe, 0xca73c99b,
+ 0x57a4f122, 0xef189647, 0xfdad39a9, 0x45115ecc,
+ 0x764dee06, 0xcef18963, 0xdc44268d, 0x64f841e8,
+ 0xf92f7951, 0x41931e34, 0x5326b1da, 0xeb9ad6bf,
+ 0xb3f9c6e9, 0x0b45a18c, 0x19f00e62, 0xa14c6907,
+ 0x3c9b51be, 0x842736db, 0x96929935, 0x2e2efe50,
+ 0x2654b999, 0x9ee8defc, 0x8c5d7112, 0x34e11677,
+ 0xa9362ece, 0x118a49ab, 0x033fe645, 0xbb838120,
+ 0xe3e09176, 0x5b5cf613, 0x49e959fd, 0xf1553e98,
+ 0x6c820621, 0xd43e6144, 0xc68bceaa, 0x7e37a9cf,
+ 0xd67f4138, 0x6ec3265d, 0x7c7689b3, 0xc4caeed6,
+ 0x591dd66f, 0xe1a1b10a, 0xf3141ee4, 0x4ba87981,
+ 0x13cb69d7, 0xab770eb2, 0xb9c2a15c, 0x017ec639,
+ 0x9ca9fe80, 0x241599e5, 0x36a0360b, 0x8e1c516e,
+ 0x866616a7, 0x3eda71c2, 0x2c6fde2c, 0x94d3b949,
+ 0x090481f0, 0xb1b8e695, 0xa30d497b, 0x1bb12e1e,
+ 0x43d23e48, 0xfb6e592d, 0xe9dbf6c3, 0x516791a6,
+ 0xccb0a91f, 0x740cce7a, 0x66b96194, 0xde0506f1,
+ 0x00000000, 0x3d6029b0, 0x7ac05360, 0x47a07ad0,
+ 0xf580a6c0, 0xc8e08f70, 0x8f40f5a0, 0xb220dc10,
+ 0x30704bc1, 0x0d106271, 0x4ab018a1, 0x77d03111,
+ 0xc5f0ed01, 0xf890c4b1, 0xbf30be61, 0x825097d1,
+ 0x60e09782, 0x5d80be32, 0x1a20c4e2, 0x2740ed52,
+ 0x95603142, 0xa80018f2, 0xefa06222, 0xd2c04b92,
+ 0x5090dc43, 0x6df0f5f3, 0x2a508f23, 0x1730a693,
+ 0xa5107a83, 0x98705333, 0xdfd029e3, 0xe2b00053,
+ 0xc1c12f04, 0xfca106b4, 0xbb017c64, 0x866155d4,
+ 0x344189c4, 0x0921a074, 0x4e81daa4, 0x73e1f314,
+ 0xf1b164c5, 0xccd14d75, 0x8b7137a5, 0xb6111e15,
+ 0x0431c205, 0x3951ebb5, 0x7ef19165, 0x4391b8d5,
+ 0xa121b886, 0x9c419136, 0xdbe1ebe6, 0xe681c256,
+ 0x54a11e46, 0x69c137f6, 0x2e614d26, 0x13016496,
+ 0x9151f347, 0xac31daf7, 0xeb91a027, 0xd6f18997,
+ 0x64d15587, 0x59b17c37, 0x1e1106e7, 0x23712f57,
+ 0x58f35849, 0x659371f9, 0x22330b29, 0x1f532299,
+ 0xad73fe89, 0x9013d739, 0xd7b3ade9, 0xead38459,
+ 0x68831388, 0x55e33a38, 0x124340e8, 0x2f236958,
+ 0x9d03b548, 0xa0639cf8, 0xe7c3e628, 0xdaa3cf98,
+ 0x3813cfcb, 0x0573e67b, 0x42d39cab, 0x7fb3b51b,
+ 0xcd93690b, 0xf0f340bb, 0xb7533a6b, 0x8a3313db,
+ 0x0863840a, 0x3503adba, 0x72a3d76a, 0x4fc3feda,
+ 0xfde322ca, 0xc0830b7a, 0x872371aa, 0xba43581a,
+ 0x9932774d, 0xa4525efd, 0xe3f2242d, 0xde920d9d,
+ 0x6cb2d18d, 0x51d2f83d, 0x167282ed, 0x2b12ab5d,
+ 0xa9423c8c, 0x9422153c, 0xd3826fec, 0xeee2465c,
+ 0x5cc29a4c, 0x61a2b3fc, 0x2602c92c, 0x1b62e09c,
+ 0xf9d2e0cf, 0xc4b2c97f, 0x8312b3af, 0xbe729a1f,
+ 0x0c52460f, 0x31326fbf, 0x7692156f, 0x4bf23cdf,
+ 0xc9a2ab0e, 0xf4c282be, 0xb362f86e, 0x8e02d1de,
+ 0x3c220dce, 0x0142247e, 0x46e25eae, 0x7b82771e,
+ 0xb1e6b092, 0x8c869922, 0xcb26e3f2, 0xf646ca42,
+ 0x44661652, 0x79063fe2, 0x3ea64532, 0x03c66c82,
+ 0x8196fb53, 0xbcf6d2e3, 0xfb56a833, 0xc6368183,
+ 0x74165d93, 0x49767423, 0x0ed60ef3, 0x33b62743,
+ 0xd1062710, 0xec660ea0, 0xabc67470, 0x96a65dc0,
+ 0x248681d0, 0x19e6a860, 0x5e46d2b0, 0x6326fb00,
+ 0xe1766cd1, 0xdc164561, 0x9bb63fb1, 0xa6d61601,
+ 0x14f6ca11, 0x2996e3a1, 0x6e369971, 0x5356b0c1,
+ 0x70279f96, 0x4d47b626, 0x0ae7ccf6, 0x3787e546,
+ 0x85a73956, 0xb8c710e6, 0xff676a36, 0xc2074386,
+ 0x4057d457, 0x7d37fde7, 0x3a978737, 0x07f7ae87,
+ 0xb5d77297, 0x88b75b27, 0xcf1721f7, 0xf2770847,
+ 0x10c70814, 0x2da721a4, 0x6a075b74, 0x576772c4,
+ 0xe547aed4, 0xd8278764, 0x9f87fdb4, 0xa2e7d404,
+ 0x20b743d5, 0x1dd76a65, 0x5a7710b5, 0x67173905,
+ 0xd537e515, 0xe857cca5, 0xaff7b675, 0x92979fc5,
+ 0xe915e8db, 0xd475c16b, 0x93d5bbbb, 0xaeb5920b,
+ 0x1c954e1b, 0x21f567ab, 0x66551d7b, 0x5b3534cb,
+ 0xd965a31a, 0xe4058aaa, 0xa3a5f07a, 0x9ec5d9ca,
+ 0x2ce505da, 0x11852c6a, 0x562556ba, 0x6b457f0a,
+ 0x89f57f59, 0xb49556e9, 0xf3352c39, 0xce550589,
+ 0x7c75d999, 0x4115f029, 0x06b58af9, 0x3bd5a349,
+ 0xb9853498, 0x84e51d28, 0xc34567f8, 0xfe254e48,
+ 0x4c059258, 0x7165bbe8, 0x36c5c138, 0x0ba5e888,
+ 0x28d4c7df, 0x15b4ee6f, 0x521494bf, 0x6f74bd0f,
+ 0xdd54611f, 0xe03448af, 0xa794327f, 0x9af41bcf,
+ 0x18a48c1e, 0x25c4a5ae, 0x6264df7e, 0x5f04f6ce,
+ 0xed242ade, 0xd044036e, 0x97e479be, 0xaa84500e,
+ 0x4834505d, 0x755479ed, 0x32f4033d, 0x0f942a8d,
+ 0xbdb4f69d, 0x80d4df2d, 0xc774a5fd, 0xfa148c4d,
+ 0x78441b9c, 0x4524322c, 0x028448fc, 0x3fe4614c,
+ 0x8dc4bd5c, 0xb0a494ec, 0xf704ee3c, 0xca64c78c,
+ 0x00000000, 0xcb5cd3a5, 0x4dc8a10b, 0x869472ae,
+ 0x9b914216, 0x50cd91b3, 0xd659e31d, 0x1d0530b8,
+ 0xec53826d, 0x270f51c8, 0xa19b2366, 0x6ac7f0c3,
+ 0x77c2c07b, 0xbc9e13de, 0x3a0a6170, 0xf156b2d5,
+ 0x03d6029b, 0xc88ad13e, 0x4e1ea390, 0x85427035,
+ 0x9847408d, 0x531b9328, 0xd58fe186, 0x1ed33223,
+ 0xef8580f6, 0x24d95353, 0xa24d21fd, 0x6911f258,
+ 0x7414c2e0, 0xbf481145, 0x39dc63eb, 0xf280b04e,
+ 0x07ac0536, 0xccf0d693, 0x4a64a43d, 0x81387798,
+ 0x9c3d4720, 0x57619485, 0xd1f5e62b, 0x1aa9358e,
+ 0xebff875b, 0x20a354fe, 0xa6372650, 0x6d6bf5f5,
+ 0x706ec54d, 0xbb3216e8, 0x3da66446, 0xf6fab7e3,
+ 0x047a07ad, 0xcf26d408, 0x49b2a6a6, 0x82ee7503,
+ 0x9feb45bb, 0x54b7961e, 0xd223e4b0, 0x197f3715,
+ 0xe82985c0, 0x23755665, 0xa5e124cb, 0x6ebdf76e,
+ 0x73b8c7d6, 0xb8e41473, 0x3e7066dd, 0xf52cb578,
+ 0x0f580a6c, 0xc404d9c9, 0x4290ab67, 0x89cc78c2,
+ 0x94c9487a, 0x5f959bdf, 0xd901e971, 0x125d3ad4,
+ 0xe30b8801, 0x28575ba4, 0xaec3290a, 0x659ffaaf,
+ 0x789aca17, 0xb3c619b2, 0x35526b1c, 0xfe0eb8b9,
+ 0x0c8e08f7, 0xc7d2db52, 0x4146a9fc, 0x8a1a7a59,
+ 0x971f4ae1, 0x5c439944, 0xdad7ebea, 0x118b384f,
+ 0xe0dd8a9a, 0x2b81593f, 0xad152b91, 0x6649f834,
+ 0x7b4cc88c, 0xb0101b29, 0x36846987, 0xfdd8ba22,
+ 0x08f40f5a, 0xc3a8dcff, 0x453cae51, 0x8e607df4,
+ 0x93654d4c, 0x58399ee9, 0xdeadec47, 0x15f13fe2,
+ 0xe4a78d37, 0x2ffb5e92, 0xa96f2c3c, 0x6233ff99,
+ 0x7f36cf21, 0xb46a1c84, 0x32fe6e2a, 0xf9a2bd8f,
+ 0x0b220dc1, 0xc07ede64, 0x46eaacca, 0x8db67f6f,
+ 0x90b34fd7, 0x5bef9c72, 0xdd7beedc, 0x16273d79,
+ 0xe7718fac, 0x2c2d5c09, 0xaab92ea7, 0x61e5fd02,
+ 0x7ce0cdba, 0xb7bc1e1f, 0x31286cb1, 0xfa74bf14,
+ 0x1eb014d8, 0xd5ecc77d, 0x5378b5d3, 0x98246676,
+ 0x852156ce, 0x4e7d856b, 0xc8e9f7c5, 0x03b52460,
+ 0xf2e396b5, 0x39bf4510, 0xbf2b37be, 0x7477e41b,
+ 0x6972d4a3, 0xa22e0706, 0x24ba75a8, 0xefe6a60d,
+ 0x1d661643, 0xd63ac5e6, 0x50aeb748, 0x9bf264ed,
+ 0x86f75455, 0x4dab87f0, 0xcb3ff55e, 0x006326fb,
+ 0xf135942e, 0x3a69478b, 0xbcfd3525, 0x77a1e680,
+ 0x6aa4d638, 0xa1f8059d, 0x276c7733, 0xec30a496,
+ 0x191c11ee, 0xd240c24b, 0x54d4b0e5, 0x9f886340,
+ 0x828d53f8, 0x49d1805d, 0xcf45f2f3, 0x04192156,
+ 0xf54f9383, 0x3e134026, 0xb8873288, 0x73dbe12d,
+ 0x6eded195, 0xa5820230, 0x2316709e, 0xe84aa33b,
+ 0x1aca1375, 0xd196c0d0, 0x5702b27e, 0x9c5e61db,
+ 0x815b5163, 0x4a0782c6, 0xcc93f068, 0x07cf23cd,
+ 0xf6999118, 0x3dc542bd, 0xbb513013, 0x700de3b6,
+ 0x6d08d30e, 0xa65400ab, 0x20c07205, 0xeb9ca1a0,
+ 0x11e81eb4, 0xdab4cd11, 0x5c20bfbf, 0x977c6c1a,
+ 0x8a795ca2, 0x41258f07, 0xc7b1fda9, 0x0ced2e0c,
+ 0xfdbb9cd9, 0x36e74f7c, 0xb0733dd2, 0x7b2fee77,
+ 0x662adecf, 0xad760d6a, 0x2be27fc4, 0xe0beac61,
+ 0x123e1c2f, 0xd962cf8a, 0x5ff6bd24, 0x94aa6e81,
+ 0x89af5e39, 0x42f38d9c, 0xc467ff32, 0x0f3b2c97,
+ 0xfe6d9e42, 0x35314de7, 0xb3a53f49, 0x78f9ecec,
+ 0x65fcdc54, 0xaea00ff1, 0x28347d5f, 0xe368aefa,
+ 0x16441b82, 0xdd18c827, 0x5b8cba89, 0x90d0692c,
+ 0x8dd55994, 0x46898a31, 0xc01df89f, 0x0b412b3a,
+ 0xfa1799ef, 0x314b4a4a, 0xb7df38e4, 0x7c83eb41,
+ 0x6186dbf9, 0xaada085c, 0x2c4e7af2, 0xe712a957,
+ 0x15921919, 0xdececabc, 0x585ab812, 0x93066bb7,
+ 0x8e035b0f, 0x455f88aa, 0xc3cbfa04, 0x089729a1,
+ 0xf9c19b74, 0x329d48d1, 0xb4093a7f, 0x7f55e9da,
+ 0x6250d962, 0xa90c0ac7, 0x2f987869, 0xe4c4abcc,
+ 0x00000000, 0xa6770bb4, 0x979f1129, 0x31e81a9d,
+ 0xf44f2413, 0x52382fa7, 0x63d0353a, 0xc5a73e8e,
+ 0x33ef4e67, 0x959845d3, 0xa4705f4e, 0x020754fa,
+ 0xc7a06a74, 0x61d761c0, 0x503f7b5d, 0xf64870e9,
+ 0x67de9cce, 0xc1a9977a, 0xf0418de7, 0x56368653,
+ 0x9391b8dd, 0x35e6b369, 0x040ea9f4, 0xa279a240,
+ 0x5431d2a9, 0xf246d91d, 0xc3aec380, 0x65d9c834,
+ 0xa07ef6ba, 0x0609fd0e, 0x37e1e793, 0x9196ec27,
+ 0xcfbd399c, 0x69ca3228, 0x582228b5, 0xfe552301,
+ 0x3bf21d8f, 0x9d85163b, 0xac6d0ca6, 0x0a1a0712,
+ 0xfc5277fb, 0x5a257c4f, 0x6bcd66d2, 0xcdba6d66,
+ 0x081d53e8, 0xae6a585c, 0x9f8242c1, 0x39f54975,
+ 0xa863a552, 0x0e14aee6, 0x3ffcb47b, 0x998bbfcf,
+ 0x5c2c8141, 0xfa5b8af5, 0xcbb39068, 0x6dc49bdc,
+ 0x9b8ceb35, 0x3dfbe081, 0x0c13fa1c, 0xaa64f1a8,
+ 0x6fc3cf26, 0xc9b4c492, 0xf85cde0f, 0x5e2bd5bb,
+ 0x440b7579, 0xe27c7ecd, 0xd3946450, 0x75e36fe4,
+ 0xb044516a, 0x16335ade, 0x27db4043, 0x81ac4bf7,
+ 0x77e43b1e, 0xd19330aa, 0xe07b2a37, 0x460c2183,
+ 0x83ab1f0d, 0x25dc14b9, 0x14340e24, 0xb2430590,
+ 0x23d5e9b7, 0x85a2e203, 0xb44af89e, 0x123df32a,
+ 0xd79acda4, 0x71edc610, 0x4005dc8d, 0xe672d739,
+ 0x103aa7d0, 0xb64dac64, 0x87a5b6f9, 0x21d2bd4d,
+ 0xe47583c3, 0x42028877, 0x73ea92ea, 0xd59d995e,
+ 0x8bb64ce5, 0x2dc14751, 0x1c295dcc, 0xba5e5678,
+ 0x7ff968f6, 0xd98e6342, 0xe86679df, 0x4e11726b,
+ 0xb8590282, 0x1e2e0936, 0x2fc613ab, 0x89b1181f,
+ 0x4c162691, 0xea612d25, 0xdb8937b8, 0x7dfe3c0c,
+ 0xec68d02b, 0x4a1fdb9f, 0x7bf7c102, 0xdd80cab6,
+ 0x1827f438, 0xbe50ff8c, 0x8fb8e511, 0x29cfeea5,
+ 0xdf879e4c, 0x79f095f8, 0x48188f65, 0xee6f84d1,
+ 0x2bc8ba5f, 0x8dbfb1eb, 0xbc57ab76, 0x1a20a0c2,
+ 0x8816eaf2, 0x2e61e146, 0x1f89fbdb, 0xb9fef06f,
+ 0x7c59cee1, 0xda2ec555, 0xebc6dfc8, 0x4db1d47c,
+ 0xbbf9a495, 0x1d8eaf21, 0x2c66b5bc, 0x8a11be08,
+ 0x4fb68086, 0xe9c18b32, 0xd82991af, 0x7e5e9a1b,
+ 0xefc8763c, 0x49bf7d88, 0x78576715, 0xde206ca1,
+ 0x1b87522f, 0xbdf0599b, 0x8c184306, 0x2a6f48b2,
+ 0xdc27385b, 0x7a5033ef, 0x4bb82972, 0xedcf22c6,
+ 0x28681c48, 0x8e1f17fc, 0xbff70d61, 0x198006d5,
+ 0x47abd36e, 0xe1dcd8da, 0xd034c247, 0x7643c9f3,
+ 0xb3e4f77d, 0x1593fcc9, 0x247be654, 0x820cede0,
+ 0x74449d09, 0xd23396bd, 0xe3db8c20, 0x45ac8794,
+ 0x800bb91a, 0x267cb2ae, 0x1794a833, 0xb1e3a387,
+ 0x20754fa0, 0x86024414, 0xb7ea5e89, 0x119d553d,
+ 0xd43a6bb3, 0x724d6007, 0x43a57a9a, 0xe5d2712e,
+ 0x139a01c7, 0xb5ed0a73, 0x840510ee, 0x22721b5a,
+ 0xe7d525d4, 0x41a22e60, 0x704a34fd, 0xd63d3f49,
+ 0xcc1d9f8b, 0x6a6a943f, 0x5b828ea2, 0xfdf58516,
+ 0x3852bb98, 0x9e25b02c, 0xafcdaab1, 0x09baa105,
+ 0xfff2d1ec, 0x5985da58, 0x686dc0c5, 0xce1acb71,
+ 0x0bbdf5ff, 0xadcafe4b, 0x9c22e4d6, 0x3a55ef62,
+ 0xabc30345, 0x0db408f1, 0x3c5c126c, 0x9a2b19d8,
+ 0x5f8c2756, 0xf9fb2ce2, 0xc813367f, 0x6e643dcb,
+ 0x982c4d22, 0x3e5b4696, 0x0fb35c0b, 0xa9c457bf,
+ 0x6c636931, 0xca146285, 0xfbfc7818, 0x5d8b73ac,
+ 0x03a0a617, 0xa5d7ada3, 0x943fb73e, 0x3248bc8a,
+ 0xf7ef8204, 0x519889b0, 0x6070932d, 0xc6079899,
+ 0x304fe870, 0x9638e3c4, 0xa7d0f959, 0x01a7f2ed,
+ 0xc400cc63, 0x6277c7d7, 0x539fdd4a, 0xf5e8d6fe,
+ 0x647e3ad9, 0xc209316d, 0xf3e12bf0, 0x55962044,
+ 0x90311eca, 0x3646157e, 0x07ae0fe3, 0xa1d90457,
+ 0x579174be, 0xf1e67f0a, 0xc00e6597, 0x66796e23,
+ 0xa3de50ad, 0x05a95b19, 0x34414184, 0x92364a30,
+ 0x00000000, 0xccaa009e, 0x4225077d, 0x8e8f07e3,
+ 0x844a0efa, 0x48e00e64, 0xc66f0987, 0x0ac50919,
+ 0xd3e51bb5, 0x1f4f1b2b, 0x91c01cc8, 0x5d6a1c56,
+ 0x57af154f, 0x9b0515d1, 0x158a1232, 0xd92012ac,
+ 0x7cbb312b, 0xb01131b5, 0x3e9e3656, 0xf23436c8,
+ 0xf8f13fd1, 0x345b3f4f, 0xbad438ac, 0x767e3832,
+ 0xaf5e2a9e, 0x63f42a00, 0xed7b2de3, 0x21d12d7d,
+ 0x2b142464, 0xe7be24fa, 0x69312319, 0xa59b2387,
+ 0xf9766256, 0x35dc62c8, 0xbb53652b, 0x77f965b5,
+ 0x7d3c6cac, 0xb1966c32, 0x3f196bd1, 0xf3b36b4f,
+ 0x2a9379e3, 0xe639797d, 0x68b67e9e, 0xa41c7e00,
+ 0xaed97719, 0x62737787, 0xecfc7064, 0x205670fa,
+ 0x85cd537d, 0x496753e3, 0xc7e85400, 0x0b42549e,
+ 0x01875d87, 0xcd2d5d19, 0x43a25afa, 0x8f085a64,
+ 0x562848c8, 0x9a824856, 0x140d4fb5, 0xd8a74f2b,
+ 0xd2624632, 0x1ec846ac, 0x9047414f, 0x5ced41d1,
+ 0x299dc2ed, 0xe537c273, 0x6bb8c590, 0xa712c50e,
+ 0xadd7cc17, 0x617dcc89, 0xeff2cb6a, 0x2358cbf4,
+ 0xfa78d958, 0x36d2d9c6, 0xb85dde25, 0x74f7debb,
+ 0x7e32d7a2, 0xb298d73c, 0x3c17d0df, 0xf0bdd041,
+ 0x5526f3c6, 0x998cf358, 0x1703f4bb, 0xdba9f425,
+ 0xd16cfd3c, 0x1dc6fda2, 0x9349fa41, 0x5fe3fadf,
+ 0x86c3e873, 0x4a69e8ed, 0xc4e6ef0e, 0x084cef90,
+ 0x0289e689, 0xce23e617, 0x40ace1f4, 0x8c06e16a,
+ 0xd0eba0bb, 0x1c41a025, 0x92cea7c6, 0x5e64a758,
+ 0x54a1ae41, 0x980baedf, 0x1684a93c, 0xda2ea9a2,
+ 0x030ebb0e, 0xcfa4bb90, 0x412bbc73, 0x8d81bced,
+ 0x8744b5f4, 0x4beeb56a, 0xc561b289, 0x09cbb217,
+ 0xac509190, 0x60fa910e, 0xee7596ed, 0x22df9673,
+ 0x281a9f6a, 0xe4b09ff4, 0x6a3f9817, 0xa6959889,
+ 0x7fb58a25, 0xb31f8abb, 0x3d908d58, 0xf13a8dc6,
+ 0xfbff84df, 0x37558441, 0xb9da83a2, 0x7570833c,
+ 0x533b85da, 0x9f918544, 0x111e82a7, 0xddb48239,
+ 0xd7718b20, 0x1bdb8bbe, 0x95548c5d, 0x59fe8cc3,
+ 0x80de9e6f, 0x4c749ef1, 0xc2fb9912, 0x0e51998c,
+ 0x04949095, 0xc83e900b, 0x46b197e8, 0x8a1b9776,
+ 0x2f80b4f1, 0xe32ab46f, 0x6da5b38c, 0xa10fb312,
+ 0xabcaba0b, 0x6760ba95, 0xe9efbd76, 0x2545bde8,
+ 0xfc65af44, 0x30cfafda, 0xbe40a839, 0x72eaa8a7,
+ 0x782fa1be, 0xb485a120, 0x3a0aa6c3, 0xf6a0a65d,
+ 0xaa4de78c, 0x66e7e712, 0xe868e0f1, 0x24c2e06f,
+ 0x2e07e976, 0xe2ade9e8, 0x6c22ee0b, 0xa088ee95,
+ 0x79a8fc39, 0xb502fca7, 0x3b8dfb44, 0xf727fbda,
+ 0xfde2f2c3, 0x3148f25d, 0xbfc7f5be, 0x736df520,
+ 0xd6f6d6a7, 0x1a5cd639, 0x94d3d1da, 0x5879d144,
+ 0x52bcd85d, 0x9e16d8c3, 0x1099df20, 0xdc33dfbe,
+ 0x0513cd12, 0xc9b9cd8c, 0x4736ca6f, 0x8b9ccaf1,
+ 0x8159c3e8, 0x4df3c376, 0xc37cc495, 0x0fd6c40b,
+ 0x7aa64737, 0xb60c47a9, 0x3883404a, 0xf42940d4,
+ 0xfeec49cd, 0x32464953, 0xbcc94eb0, 0x70634e2e,
+ 0xa9435c82, 0x65e95c1c, 0xeb665bff, 0x27cc5b61,
+ 0x2d095278, 0xe1a352e6, 0x6f2c5505, 0xa386559b,
+ 0x061d761c, 0xcab77682, 0x44387161, 0x889271ff,
+ 0x825778e6, 0x4efd7878, 0xc0727f9b, 0x0cd87f05,
+ 0xd5f86da9, 0x19526d37, 0x97dd6ad4, 0x5b776a4a,
+ 0x51b26353, 0x9d1863cd, 0x1397642e, 0xdf3d64b0,
+ 0x83d02561, 0x4f7a25ff, 0xc1f5221c, 0x0d5f2282,
+ 0x079a2b9b, 0xcb302b05, 0x45bf2ce6, 0x89152c78,
+ 0x50353ed4, 0x9c9f3e4a, 0x121039a9, 0xdeba3937,
+ 0xd47f302e, 0x18d530b0, 0x965a3753, 0x5af037cd,
+ 0xff6b144a, 0x33c114d4, 0xbd4e1337, 0x71e413a9,
+ 0x7b211ab0, 0xb78b1a2e, 0x39041dcd, 0xf5ae1d53,
+ 0x2c8e0fff, 0xe0240f61, 0x6eab0882, 0xa201081c,
+ 0xa8c40105, 0x646e019b, 0xeae10678, 0x264b06e6,
+};
diff --git a/tools/z64compress/src/enc/libdeflate/lib/decompress_template.h b/tools/z64compress/src/enc/libdeflate/lib/decompress_template.h
new file mode 100644
index 000000000..2d9dfa82b
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/lib/decompress_template.h
@@ -0,0 +1,774 @@
+/*
+ * decompress_template.h
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * This is the actual DEFLATE decompression routine, lifted out of
+ * deflate_decompress.c so that it can be compiled multiple times with different
+ * target instruction sets.
+ */
+
+#ifndef ATTRIBUTES
+# define ATTRIBUTES
+#endif
+#ifndef EXTRACT_VARBITS
+# define EXTRACT_VARBITS(word, count) ((word) & BITMASK(count))
+#endif
+#ifndef EXTRACT_VARBITS8
+# define EXTRACT_VARBITS8(word, count) ((word) & BITMASK((u8)(count)))
+#endif
+
+static enum libdeflate_result ATTRIBUTES MAYBE_UNUSED
+FUNCNAME(struct libdeflate_decompressor * restrict d,
+ const void * restrict in, size_t in_nbytes,
+ void * restrict out, size_t out_nbytes_avail,
+ size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret)
+{
+ u8 *out_next = out;
+ u8 * const out_end = out_next + out_nbytes_avail;
+ u8 * const out_fastloop_end =
+ out_end - MIN(out_nbytes_avail, FASTLOOP_MAX_BYTES_WRITTEN);
+
+ /* Input bitstream state; see deflate_decompress.c for documentation */
+ const u8 *in_next = in;
+ const u8 * const in_end = in_next + in_nbytes;
+ const u8 * const in_fastloop_end =
+ in_end - MIN(in_nbytes, FASTLOOP_MAX_BYTES_READ);
+ bitbuf_t bitbuf = 0;
+ bitbuf_t saved_bitbuf;
+ u32 bitsleft = 0;
+ size_t overread_count = 0;
+
+ bool is_final_block;
+ unsigned block_type;
+ unsigned num_litlen_syms;
+ unsigned num_offset_syms;
+ bitbuf_t litlen_tablemask;
+ u32 entry;
+
+next_block:
+ /* Starting to read the next block */
+ ;
+
+ STATIC_ASSERT(CAN_CONSUME(1 + 2 + 5 + 5 + 4 + 3));
+ REFILL_BITS();
+
+ /* BFINAL: 1 bit */
+ is_final_block = bitbuf & BITMASK(1);
+
+ /* BTYPE: 2 bits */
+ block_type = (bitbuf >> 1) & BITMASK(2);
+
+ if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) {
+
+ /* Dynamic Huffman block */
+
+ /* The order in which precode lengths are stored */
+ static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
+ 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
+ };
+
+ unsigned num_explicit_precode_lens;
+ unsigned i;
+
+ /* Read the codeword length counts. */
+
+ STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 257 + BITMASK(5));
+ num_litlen_syms = 257 + ((bitbuf >> 3) & BITMASK(5));
+
+ STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 1 + BITMASK(5));
+ num_offset_syms = 1 + ((bitbuf >> 8) & BITMASK(5));
+
+ STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == 4 + BITMASK(4));
+ num_explicit_precode_lens = 4 + ((bitbuf >> 13) & BITMASK(4));
+
+ d->static_codes_loaded = false;
+
+ /*
+ * Read the precode codeword lengths.
+ *
+ * A 64-bit bitbuffer is just one bit too small to hold the
+ * maximum number of precode lens, so to minimize branches we
+ * merge one len with the previous fields.
+ */
+ STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1);
+ if (CAN_CONSUME(3 * (DEFLATE_NUM_PRECODE_SYMS - 1))) {
+ d->u.precode_lens[deflate_precode_lens_permutation[0]] =
+ (bitbuf >> 17) & BITMASK(3);
+ bitbuf >>= 20;
+ bitsleft -= 20;
+ REFILL_BITS();
+ i = 1;
+ do {
+ d->u.precode_lens[deflate_precode_lens_permutation[i]] =
+ bitbuf & BITMASK(3);
+ bitbuf >>= 3;
+ bitsleft -= 3;
+ } while (++i < num_explicit_precode_lens);
+ } else {
+ bitbuf >>= 17;
+ bitsleft -= 17;
+ i = 0;
+ do {
+ if ((u8)bitsleft < 3)
+ REFILL_BITS();
+ d->u.precode_lens[deflate_precode_lens_permutation[i]] =
+ bitbuf & BITMASK(3);
+ bitbuf >>= 3;
+ bitsleft -= 3;
+ } while (++i < num_explicit_precode_lens);
+ }
+ for (; i < DEFLATE_NUM_PRECODE_SYMS; i++)
+ d->u.precode_lens[deflate_precode_lens_permutation[i]] = 0;
+
+ /* Build the decode table for the precode. */
+ SAFETY_CHECK(build_precode_decode_table(d));
+
+ /* Decode the litlen and offset codeword lengths. */
+ i = 0;
+ do {
+ unsigned presym;
+ u8 rep_val;
+ unsigned rep_count;
+
+ if ((u8)bitsleft < DEFLATE_MAX_PRE_CODEWORD_LEN + 7)
+ REFILL_BITS();
+
+ /*
+ * The code below assumes that the precode decode table
+ * doesn't have any subtables.
+ */
+ STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN);
+
+ /* Decode the next precode symbol. */
+ entry = d->u.l.precode_decode_table[
+ bitbuf & BITMASK(DEFLATE_MAX_PRE_CODEWORD_LEN)];
+ bitbuf >>= (u8)entry;
+ bitsleft -= entry; /* optimization: subtract full entry */
+ presym = entry >> 16;
+
+ if (presym < 16) {
+ /* Explicit codeword length */
+ d->u.l.lens[i++] = presym;
+ continue;
+ }
+
+ /* Run-length encoded codeword lengths */
+
+ /*
+ * Note: we don't need verify that the repeat count
+ * doesn't overflow the number of elements, since we've
+ * sized the lens array to have enough extra space to
+ * allow for the worst-case overrun (138 zeroes when
+ * only 1 length was remaining).
+ *
+ * In the case of the small repeat counts (presyms 16
+ * and 17), it is fastest to always write the maximum
+ * number of entries. That gets rid of branches that
+ * would otherwise be required.
+ *
+ * It is not just because of the numerical order that
+ * our checks go in the order 'presym < 16', 'presym ==
+ * 16', and 'presym == 17'. For typical data this is
+ * ordered from most frequent to least frequent case.
+ */
+ STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1);
+
+ if (presym == 16) {
+ /* Repeat the previous length 3 - 6 times. */
+ SAFETY_CHECK(i != 0);
+ rep_val = d->u.l.lens[i - 1];
+ STATIC_ASSERT(3 + BITMASK(2) == 6);
+ rep_count = 3 + (bitbuf & BITMASK(2));
+ bitbuf >>= 2;
+ bitsleft -= 2;
+ d->u.l.lens[i + 0] = rep_val;
+ d->u.l.lens[i + 1] = rep_val;
+ d->u.l.lens[i + 2] = rep_val;
+ d->u.l.lens[i + 3] = rep_val;
+ d->u.l.lens[i + 4] = rep_val;
+ d->u.l.lens[i + 5] = rep_val;
+ i += rep_count;
+ } else if (presym == 17) {
+ /* Repeat zero 3 - 10 times. */
+ STATIC_ASSERT(3 + BITMASK(3) == 10);
+ rep_count = 3 + (bitbuf & BITMASK(3));
+ bitbuf >>= 3;
+ bitsleft -= 3;
+ d->u.l.lens[i + 0] = 0;
+ d->u.l.lens[i + 1] = 0;
+ d->u.l.lens[i + 2] = 0;
+ d->u.l.lens[i + 3] = 0;
+ d->u.l.lens[i + 4] = 0;
+ d->u.l.lens[i + 5] = 0;
+ d->u.l.lens[i + 6] = 0;
+ d->u.l.lens[i + 7] = 0;
+ d->u.l.lens[i + 8] = 0;
+ d->u.l.lens[i + 9] = 0;
+ i += rep_count;
+ } else {
+ /* Repeat zero 11 - 138 times. */
+ STATIC_ASSERT(11 + BITMASK(7) == 138);
+ rep_count = 11 + (bitbuf & BITMASK(7));
+ bitbuf >>= 7;
+ bitsleft -= 7;
+ memset(&d->u.l.lens[i], 0,
+ rep_count * sizeof(d->u.l.lens[i]));
+ i += rep_count;
+ }
+ } while (i < num_litlen_syms + num_offset_syms);
+
+ } else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) {
+ u16 len, nlen;
+
+ /*
+ * Uncompressed block: copy 'len' bytes literally from the input
+ * buffer to the output buffer.
+ */
+
+ bitsleft -= 3; /* for BTYPE and BFINAL */
+
+ /*
+ * Align the bitstream to the next byte boundary. This means
+ * the next byte boundary as if we were reading a byte at a
+ * time. Therefore, we have to rewind 'in_next' by any bytes
+ * that have been refilled but not actually consumed yet (not
+ * counting overread bytes, which don't increment 'in_next').
+ */
+ bitsleft = (u8)bitsleft;
+ SAFETY_CHECK(overread_count <= (bitsleft >> 3));
+ in_next -= (bitsleft >> 3) - overread_count;
+ overread_count = 0;
+ bitbuf = 0;
+ bitsleft = 0;
+
+ SAFETY_CHECK(in_end - in_next >= 4);
+ len = get_unaligned_le16(in_next);
+ nlen = get_unaligned_le16(in_next + 2);
+ in_next += 4;
+
+ SAFETY_CHECK(len == (u16)~nlen);
+ if (unlikely(len > out_end - out_next))
+ return LIBDEFLATE_INSUFFICIENT_SPACE;
+ SAFETY_CHECK(len <= in_end - in_next);
+
+ memcpy(out_next, in_next, len);
+ in_next += len;
+ out_next += len;
+
+ goto block_done;
+
+ } else {
+ unsigned i;
+
+ SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN);
+
+ /*
+ * Static Huffman block: build the decode tables for the static
+ * codes. Skip doing so if the tables are already set up from
+ * an earlier static block; this speeds up decompression of
+ * degenerate input of many empty or very short static blocks.
+ *
+ * Afterwards, the remainder is the same as decompressing a
+ * dynamic Huffman block.
+ */
+
+ bitbuf >>= 3; /* for BTYPE and BFINAL */
+ bitsleft -= 3;
+
+ if (d->static_codes_loaded)
+ goto have_decode_tables;
+
+ d->static_codes_loaded = true;
+
+ STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288);
+ STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32);
+
+ for (i = 0; i < 144; i++)
+ d->u.l.lens[i] = 8;
+ for (; i < 256; i++)
+ d->u.l.lens[i] = 9;
+ for (; i < 280; i++)
+ d->u.l.lens[i] = 7;
+ for (; i < 288; i++)
+ d->u.l.lens[i] = 8;
+
+ for (; i < 288 + 32; i++)
+ d->u.l.lens[i] = 5;
+
+ num_litlen_syms = 288;
+ num_offset_syms = 32;
+ }
+
+ /* Decompressing a Huffman block (either dynamic or static) */
+
+ SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms));
+ SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms));
+have_decode_tables:
+ litlen_tablemask = BITMASK(d->litlen_tablebits);
+
+ /*
+ * This is the "fastloop" for decoding literals and matches. It does
+ * bounds checks on in_next and out_next in the loop conditions so that
+ * additional bounds checks aren't needed inside the loop body.
+ *
+ * To reduce latency, the bitbuffer is refilled and the next litlen
+ * decode table entry is preloaded before each loop iteration.
+ */
+ if (in_next >= in_fastloop_end || out_next >= out_fastloop_end)
+ goto generic_loop;
+ REFILL_BITS_IN_FASTLOOP();
+ entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+ do {
+ u32 length, offset, lit;
+ const u8 *src;
+ u8 *dst;
+
+ /*
+ * Consume the bits for the litlen decode table entry. Save the
+ * original bitbuf for later, in case the extra match length
+ * bits need to be extracted from it.
+ */
+ saved_bitbuf = bitbuf;
+ bitbuf >>= (u8)entry;
+ bitsleft -= entry; /* optimization: subtract full entry */
+
+ /*
+ * Begin by checking for a "fast" literal, i.e. a literal that
+ * doesn't need a subtable.
+ */
+ if (entry & HUFFDEC_LITERAL) {
+ /*
+ * On 64-bit platforms, we decode up to 2 extra fast
+ * literals in addition to the primary item, as this
+ * increases performance and still leaves enough bits
+ * remaining for what follows. We could actually do 3,
+ * assuming LITLEN_TABLEBITS=11, but that actually
+ * decreases performance slightly (perhaps by messing
+ * with the branch prediction of the conditional refill
+ * that happens later while decoding the match offset).
+ *
+ * Note: the definitions of FASTLOOP_MAX_BYTES_WRITTEN
+ * and FASTLOOP_MAX_BYTES_READ need to be updated if the
+ * number of extra literals decoded here is changed.
+ */
+ if (/* enough bits for 2 fast literals + length + offset preload? */
+ CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS +
+ LENGTH_MAXBITS,
+ OFFSET_TABLEBITS) &&
+ /* enough bits for 2 fast literals + slow literal + litlen preload? */
+ CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS +
+ DEFLATE_MAX_LITLEN_CODEWORD_LEN,
+ LITLEN_TABLEBITS)) {
+ /* 1st extra fast literal */
+ lit = entry >> 16;
+ entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+ saved_bitbuf = bitbuf;
+ bitbuf >>= (u8)entry;
+ bitsleft -= entry;
+ *out_next++ = lit;
+ if (entry & HUFFDEC_LITERAL) {
+ /* 2nd extra fast literal */
+ lit = entry >> 16;
+ entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+ saved_bitbuf = bitbuf;
+ bitbuf >>= (u8)entry;
+ bitsleft -= entry;
+ *out_next++ = lit;
+ if (entry & HUFFDEC_LITERAL) {
+ /*
+ * Another fast literal, but
+ * this one is in lieu of the
+ * primary item, so it doesn't
+ * count as one of the extras.
+ */
+ lit = entry >> 16;
+ entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+ REFILL_BITS_IN_FASTLOOP();
+ *out_next++ = lit;
+ continue;
+ }
+ }
+ } else {
+ /*
+ * Decode a literal. While doing so, preload
+ * the next litlen decode table entry and refill
+ * the bitbuffer. To reduce latency, we've
+ * arranged for there to be enough "preloadable"
+ * bits remaining to do the table preload
+ * independently of the refill.
+ */
+ STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD(
+ LITLEN_TABLEBITS, LITLEN_TABLEBITS));
+ lit = entry >> 16;
+ entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+ REFILL_BITS_IN_FASTLOOP();
+ *out_next++ = lit;
+ continue;
+ }
+ }
+
+ /*
+ * It's not a literal entry, so it can be a length entry, a
+ * subtable pointer entry, or an end-of-block entry. Detect the
+ * two unlikely cases by testing the HUFFDEC_EXCEPTIONAL flag.
+ */
+ if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
+ /* Subtable pointer or end-of-block entry */
+
+ if (unlikely(entry & HUFFDEC_END_OF_BLOCK))
+ goto block_done;
+
+ /*
+ * A subtable is required. Load and consume the
+ * subtable entry. The subtable entry can be of any
+ * type: literal, length, or end-of-block.
+ */
+ entry = d->u.litlen_decode_table[(entry >> 16) +
+ EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
+ saved_bitbuf = bitbuf;
+ bitbuf >>= (u8)entry;
+ bitsleft -= entry;
+
+ /*
+ * 32-bit platforms that use the byte-at-a-time refill
+ * method have to do a refill here for there to always
+ * be enough bits to decode a literal that requires a
+ * subtable, then preload the next litlen decode table
+ * entry; or to decode a match length that requires a
+ * subtable, then preload the offset decode table entry.
+ */
+ if (!CAN_CONSUME_AND_THEN_PRELOAD(DEFLATE_MAX_LITLEN_CODEWORD_LEN,
+ LITLEN_TABLEBITS) ||
+ !CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXBITS,
+ OFFSET_TABLEBITS))
+ REFILL_BITS_IN_FASTLOOP();
+ if (entry & HUFFDEC_LITERAL) {
+ /* Decode a literal that required a subtable. */
+ lit = entry >> 16;
+ entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+ REFILL_BITS_IN_FASTLOOP();
+ *out_next++ = lit;
+ continue;
+ }
+ if (unlikely(entry & HUFFDEC_END_OF_BLOCK))
+ goto block_done;
+ /* Else, it's a length that required a subtable. */
+ }
+
+ /*
+ * Decode the match length: the length base value associated
+ * with the litlen symbol (which we extract from the decode
+ * table entry), plus the extra length bits. We don't need to
+ * consume the extra length bits here, as they were included in
+ * the bits consumed by the entry earlier. We also don't need
+ * to check for too-long matches here, as this is inside the
+ * fastloop where it's already been verified that the output
+ * buffer has enough space remaining to copy a max-length match.
+ */
+ length = entry >> 16;
+ length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8);
+
+ /*
+ * Decode the match offset. There are enough "preloadable" bits
+ * remaining to preload the offset decode table entry, but a
+ * refill might be needed before consuming it.
+ */
+ STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXFASTBITS,
+ OFFSET_TABLEBITS));
+ entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)];
+ if (CAN_CONSUME_AND_THEN_PRELOAD(OFFSET_MAXBITS,
+ LITLEN_TABLEBITS)) {
+ /*
+ * Decoding a match offset on a 64-bit platform. We may
+ * need to refill once, but then we can decode the whole
+ * offset and preload the next litlen table entry.
+ */
+ if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
+ /* Offset codeword requires a subtable */
+ if (unlikely((u8)bitsleft < OFFSET_MAXBITS +
+ LITLEN_TABLEBITS - PRELOAD_SLACK))
+ REFILL_BITS_IN_FASTLOOP();
+ bitbuf >>= OFFSET_TABLEBITS;
+ bitsleft -= OFFSET_TABLEBITS;
+ entry = d->offset_decode_table[(entry >> 16) +
+ EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
+ } else if (unlikely((u8)bitsleft < OFFSET_MAXFASTBITS +
+ LITLEN_TABLEBITS - PRELOAD_SLACK))
+ REFILL_BITS_IN_FASTLOOP();
+ } else {
+ /* Decoding a match offset on a 32-bit platform */
+ REFILL_BITS_IN_FASTLOOP();
+ if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
+ /* Offset codeword requires a subtable */
+ bitbuf >>= OFFSET_TABLEBITS;
+ bitsleft -= OFFSET_TABLEBITS;
+ entry = d->offset_decode_table[(entry >> 16) +
+ EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
+ REFILL_BITS_IN_FASTLOOP();
+ /* No further refill needed before extra bits */
+ STATIC_ASSERT(CAN_CONSUME(
+ OFFSET_MAXBITS - OFFSET_TABLEBITS));
+ } else {
+ /* No refill needed before extra bits */
+ STATIC_ASSERT(CAN_CONSUME(OFFSET_MAXFASTBITS));
+ }
+ }
+ saved_bitbuf = bitbuf;
+ bitbuf >>= (u8)entry;
+ bitsleft -= entry; /* optimization: subtract full entry */
+ offset = entry >> 16;
+ offset += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8);
+
+ /* Validate the match offset; needed even in the fastloop. */
+ SAFETY_CHECK(offset <= out_next - (const u8 *)out);
+ src = out_next - offset;
+ dst = out_next;
+ out_next += length;
+
+ /*
+ * Before starting to issue the instructions to copy the match,
+ * refill the bitbuffer and preload the litlen decode table
+ * entry for the next loop iteration. This can increase
+ * performance by allowing the latency of the match copy to
+ * overlap with these other operations. To further reduce
+ * latency, we've arranged for there to be enough bits remaining
+ * to do the table preload independently of the refill, except
+ * on 32-bit platforms using the byte-at-a-time refill method.
+ */
+ if (!CAN_CONSUME_AND_THEN_PRELOAD(
+ MAX(OFFSET_MAXBITS - OFFSET_TABLEBITS,
+ OFFSET_MAXFASTBITS),
+ LITLEN_TABLEBITS) &&
+ unlikely((u8)bitsleft < LITLEN_TABLEBITS - PRELOAD_SLACK))
+ REFILL_BITS_IN_FASTLOOP();
+ entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+ REFILL_BITS_IN_FASTLOOP();
+
+ /*
+ * Copy the match. On most CPUs the fastest method is a
+ * word-at-a-time copy, unconditionally copying about 5 words
+ * since this is enough for most matches without being too much.
+ *
+ * The normal word-at-a-time copy works for offset >= WORDBYTES,
+ * which is most cases. The case of offset == 1 is also common
+ * and is worth optimizing for, since it is just RLE encoding of
+ * the previous byte, which is the result of compressing long
+ * runs of the same byte.
+ *
+ * Writing past the match 'length' is allowed here, since it's
+ * been ensured there is enough output space left for a slight
+ * overrun. FASTLOOP_MAX_BYTES_WRITTEN needs to be updated if
+ * the maximum possible overrun here is changed.
+ */
+ if (UNALIGNED_ACCESS_IS_FAST && offset >= WORDBYTES) {
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ while (dst < out_next) {
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ }
+ } else if (UNALIGNED_ACCESS_IS_FAST && offset == 1) {
+ machine_word_t v;
+
+ /*
+ * This part tends to get auto-vectorized, so keep it
+ * copying a multiple of 16 bytes at a time.
+ */
+ v = (machine_word_t)0x0101010101010101 * src[0];
+ store_word_unaligned(v, dst);
+ dst += WORDBYTES;
+ store_word_unaligned(v, dst);
+ dst += WORDBYTES;
+ store_word_unaligned(v, dst);
+ dst += WORDBYTES;
+ store_word_unaligned(v, dst);
+ dst += WORDBYTES;
+ while (dst < out_next) {
+ store_word_unaligned(v, dst);
+ dst += WORDBYTES;
+ store_word_unaligned(v, dst);
+ dst += WORDBYTES;
+ store_word_unaligned(v, dst);
+ dst += WORDBYTES;
+ store_word_unaligned(v, dst);
+ dst += WORDBYTES;
+ }
+ } else if (UNALIGNED_ACCESS_IS_FAST) {
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += offset;
+ dst += offset;
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += offset;
+ dst += offset;
+ do {
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += offset;
+ dst += offset;
+ store_word_unaligned(load_word_unaligned(src), dst);
+ src += offset;
+ dst += offset;
+ } while (dst < out_next);
+ } else {
+ *dst++ = *src++;
+ *dst++ = *src++;
+ do {
+ *dst++ = *src++;
+ } while (dst < out_next);
+ }
+ } while (in_next < in_fastloop_end && out_next < out_fastloop_end);
+
+ /*
+ * This is the generic loop for decoding literals and matches. This
+ * handles cases where in_next and out_next are close to the end of
+ * their respective buffers. Usually this loop isn't performance-
+ * critical, as most time is spent in the fastloop above instead. We
+ * therefore omit some optimizations here in favor of smaller code.
+ */
+generic_loop:
+ for (;;) {
+ u32 length, offset;
+ const u8 *src;
+ u8 *dst;
+
+ REFILL_BITS();
+ entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask];
+ saved_bitbuf = bitbuf;
+ bitbuf >>= (u8)entry;
+ bitsleft -= entry;
+ if (unlikely(entry & HUFFDEC_SUBTABLE_POINTER)) {
+ entry = d->u.litlen_decode_table[(entry >> 16) +
+ EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
+ saved_bitbuf = bitbuf;
+ bitbuf >>= (u8)entry;
+ bitsleft -= entry;
+ }
+ length = entry >> 16;
+ if (entry & HUFFDEC_LITERAL) {
+ if (unlikely(out_next == out_end))
+ return LIBDEFLATE_INSUFFICIENT_SPACE;
+ *out_next++ = length;
+ continue;
+ }
+ if (unlikely(entry & HUFFDEC_END_OF_BLOCK))
+ goto block_done;
+ length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8);
+ if (unlikely(length > out_end - out_next))
+ return LIBDEFLATE_INSUFFICIENT_SPACE;
+
+ if (!CAN_CONSUME(LENGTH_MAXBITS + OFFSET_MAXBITS))
+ REFILL_BITS();
+ entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)];
+ if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) {
+ bitbuf >>= OFFSET_TABLEBITS;
+ bitsleft -= OFFSET_TABLEBITS;
+ entry = d->offset_decode_table[(entry >> 16) +
+ EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)];
+ if (!CAN_CONSUME(OFFSET_MAXBITS))
+ REFILL_BITS();
+ }
+ offset = entry >> 16;
+ offset += EXTRACT_VARBITS8(bitbuf, entry) >> (u8)(entry >> 8);
+ bitbuf >>= (u8)entry;
+ bitsleft -= entry;
+
+ SAFETY_CHECK(offset <= out_next - (const u8 *)out);
+ src = out_next - offset;
+ dst = out_next;
+ out_next += length;
+
+ STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN == 3);
+ *dst++ = *src++;
+ *dst++ = *src++;
+ do {
+ *dst++ = *src++;
+ } while (dst < out_next);
+ }
+
+block_done:
+ /* Finished decoding a block */
+
+ if (!is_final_block)
+ goto next_block;
+
+ /* That was the last block. */
+
+ bitsleft = (u8)bitsleft;
+
+ /*
+ * If any of the implicit appended zero bytes were consumed (not just
+ * refilled) before hitting end of stream, then the data is bad.
+ */
+ SAFETY_CHECK(overread_count <= (bitsleft >> 3));
+
+ /* Optionally return the actual number of bytes consumed. */
+ if (actual_in_nbytes_ret) {
+ /* Don't count bytes that were refilled but not consumed. */
+ in_next -= (bitsleft >> 3) - overread_count;
+
+ *actual_in_nbytes_ret = in_next - (u8 *)in;
+ }
+
+ /* Optionally return the actual number of bytes written. */
+ if (actual_out_nbytes_ret) {
+ *actual_out_nbytes_ret = out_next - (u8 *)out;
+ } else {
+ if (out_next != out_end)
+ return LIBDEFLATE_SHORT_OUTPUT;
+ }
+ return LIBDEFLATE_SUCCESS;
+}
+
+#undef FUNCNAME
+#undef ATTRIBUTES
+#undef EXTRACT_VARBITS
+#undef EXTRACT_VARBITS8
diff --git a/tools/z64compress/src/enc/libdeflate/lib/deflate_compress.c b/tools/z64compress/src/enc/libdeflate/lib/deflate_compress.c
new file mode 100644
index 000000000..7c92d9823
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/lib/deflate_compress.c
@@ -0,0 +1,3877 @@
+/*
+ * deflate_compress.c - a compressor for DEFLATE
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "deflate_compress.h"
+#include "deflate_constants.h"
+
+#include "libdeflate.h"
+
+/******************************************************************************/
+
+/*
+ * The following parameters can be changed at build time to customize the
+ * compression algorithms slightly:
+ *
+ * (Note, not all customizable parameters are here. Some others can be found in
+ * libdeflate_alloc_compressor() and in *_matchfinder.h.)
+ */
+
+/*
+ * If this parameter is defined to 1, then the near-optimal parsing algorithm
+ * will be included, and compression levels 10-12 will use it. This algorithm
+ * usually produces a compression ratio significantly better than the other
+ * algorithms. However, it is slow. If this parameter is defined to 0, then
+ * levels 10-12 will be the same as level 9 and will use the lazy2 algorithm.
+ */
+#define SUPPORT_NEAR_OPTIMAL_PARSING 1
+
+/*
+ * This is the minimum block length that the compressor will use, in
+ * uncompressed bytes. This should be a value below which using shorter blocks
+ * is unlikely to be worthwhile, due to the per-block overhead. This value does
+ * not apply to the final block, which may be shorter than this (if the input is
+ * shorter, it will have to be), or to the final uncompressed block in a series
+ * of uncompressed blocks that cover more than UINT16_MAX bytes.
+ *
+ * This value is also approximately the amount by which what would otherwise be
+ * the second-to-last block is allowed to grow past the soft maximum length in
+ * order to avoid having to use a very short final block.
+ *
+ * Defining a fixed minimum block length is needed in order to guarantee a
+ * reasonable upper bound on the compressed size. It's also needed because our
+ * block splitting algorithm doesn't work well on very short blocks.
+ */
+#define MIN_BLOCK_LENGTH 5000
+
+/*
+ * For the greedy, lazy, lazy2, and near-optimal compressors: This is the soft
+ * maximum block length, in uncompressed bytes. The compressor will try to end
+ * blocks at this length, but it may go slightly past it if there is a match
+ * that straddles this limit or if the input data ends soon after this limit.
+ * This parameter doesn't apply to uncompressed blocks, which the DEFLATE format
+ * limits to 65535 bytes.
+ *
+ * This should be a value above which it is very likely that splitting the block
+ * would produce a better compression ratio. For the near-optimal compressor,
+ * increasing/decreasing this parameter will increase/decrease per-compressor
+ * memory usage linearly.
+ */
+#define SOFT_MAX_BLOCK_LENGTH 300000
+
+/*
+ * For the greedy, lazy, and lazy2 compressors: this is the length of the
+ * sequence store, which is an array where the compressor temporarily stores
+ * matches that it's going to use in the current block. This value is the
+ * maximum number of matches that can be used in a block. If the sequence store
+ * fills up, then the compressor will be forced to end the block early. This
+ * value should be large enough so that this rarely happens, due to the block
+ * being ended normally before then. Increasing/decreasing this value will
+ * increase/decrease per-compressor memory usage linearly.
+ */
+#define SEQ_STORE_LENGTH 50000
+
+/*
+ * For deflate_compress_fastest(): This is the soft maximum block length.
+ * deflate_compress_fastest() doesn't use the regular block splitting algorithm;
+ * it only ends blocks when they reach FAST_SOFT_MAX_BLOCK_LENGTH bytes or
+ * FAST_SEQ_STORE_LENGTH matches. Therefore, this value should be lower than
+ * the regular SOFT_MAX_BLOCK_LENGTH.
+ */
+#define FAST_SOFT_MAX_BLOCK_LENGTH 65535
+
+/*
+ * For deflate_compress_fastest(): this is the length of the sequence store.
+ * This is like SEQ_STORE_LENGTH, but this should be a lower value.
+ */
+#define FAST_SEQ_STORE_LENGTH 8192
+
+/*
+ * These are the maximum codeword lengths, in bits, the compressor will use for
+ * each Huffman code. The DEFLATE format defines limits for these. However,
+ * further limiting litlen codewords to 14 bits is beneficial, since it has
+ * negligible effect on compression ratio but allows some optimizations when
+ * outputting bits. (It allows 4 literals to be written at once rather than 3.)
+ */
+#define MAX_LITLEN_CODEWORD_LEN 14
+#define MAX_OFFSET_CODEWORD_LEN DEFLATE_MAX_OFFSET_CODEWORD_LEN
+#define MAX_PRE_CODEWORD_LEN DEFLATE_MAX_PRE_CODEWORD_LEN
+
+#if SUPPORT_NEAR_OPTIMAL_PARSING
+
+/* Parameters specific to the near-optimal parsing algorithm */
+
+/*
+ * BIT_COST is a scaling factor that allows the near-optimal compressor to
+ * consider fractional bit costs when deciding which literal/match sequence to
+ * use. This is useful when the true symbol costs are unknown. For example, if
+ * the compressor thinks that a symbol has 6.5 bits of entropy, it can set its
+ * cost to 6.5 bits rather than have to use 6 or 7 bits. Although in the end
+ * each symbol will use a whole number of bits due to the Huffman coding,
+ * considering fractional bits can be helpful due to the limited information.
+ *
+ * BIT_COST should be a power of 2. A value of 8 or 16 works well. A higher
+ * value isn't very useful since the calculations are approximate anyway.
+ *
+ * BIT_COST doesn't apply to deflate_flush_block(), which considers whole bits.
+ */
+#define BIT_COST 16
+
+/*
+ * The NOSTAT_BITS value for a given alphabet is the number of bits assumed to
+ * be needed to output a symbol that was unused in the previous optimization
+ * pass. Assigning a default cost allows the symbol to be used in the next
+ * optimization pass. However, the cost should be relatively high because the
+ * symbol probably won't be used very many times (if at all).
+ */
+#define LITERAL_NOSTAT_BITS 13
+#define LENGTH_NOSTAT_BITS 13
+#define OFFSET_NOSTAT_BITS 10
+
+/*
+ * This is (slightly less than) the maximum number of matches that the
+ * near-optimal compressor will cache per block. This behaves similarly to
+ * SEQ_STORE_LENGTH for the other compressors.
+ */
+#define MATCH_CACHE_LENGTH (SOFT_MAX_BLOCK_LENGTH * 5)
+
+#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
+
+/******************************************************************************/
+
+/* Include the needed matchfinders. */
+#define MATCHFINDER_WINDOW_ORDER DEFLATE_WINDOW_ORDER
+#include "hc_matchfinder.h"
+#include "ht_matchfinder.h"
+#if SUPPORT_NEAR_OPTIMAL_PARSING
+# include "bt_matchfinder.h"
+/*
+ * This is the maximum number of matches the binary trees matchfinder can find
+ * at a single position. Since the matchfinder never finds more than one match
+ * for the same length, presuming one of each possible length is sufficient for
+ * an upper bound. (This says nothing about whether it is worthwhile to
+ * consider so many matches; this is just defining the worst case.)
+ */
+#define MAX_MATCHES_PER_POS \
+ (DEFLATE_MAX_MATCH_LEN - DEFLATE_MIN_MATCH_LEN + 1)
+#endif
+
+/*
+ * The largest block length we will ever use is when the final block is of
+ * length SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH - 1, or when any block is of
+ * length SOFT_MAX_BLOCK_LENGTH + 1 + DEFLATE_MAX_MATCH_LEN. The latter case
+ * occurs when the lazy2 compressor chooses two literals and a maximum-length
+ * match, starting at SOFT_MAX_BLOCK_LENGTH - 1.
+ */
+#define MAX_BLOCK_LENGTH \
+ MAX(SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH - 1, \
+ SOFT_MAX_BLOCK_LENGTH + 1 + DEFLATE_MAX_MATCH_LEN)
+
+/*
+ * Build-time sanity checks on the tunable parameters defined above.  This
+ * function contains only STATIC_ASSERT()s, so it has no runtime effect;
+ * it exists so that invalid parameter combinations fail at compile time.
+ */
+static forceinline void
+check_buildtime_parameters(void)
+{
+	/*
+	 * Verify that MIN_BLOCK_LENGTH is being honored, as
+	 * libdeflate_deflate_compress_bound() depends on it.
+	 */
+	STATIC_ASSERT(SOFT_MAX_BLOCK_LENGTH >= MIN_BLOCK_LENGTH);
+	STATIC_ASSERT(FAST_SOFT_MAX_BLOCK_LENGTH >= MIN_BLOCK_LENGTH);
+	/* A full sequence store must always be able to cover a minimum block. */
+	STATIC_ASSERT(SEQ_STORE_LENGTH * DEFLATE_MIN_MATCH_LEN >=
+		      MIN_BLOCK_LENGTH);
+	STATIC_ASSERT(FAST_SEQ_STORE_LENGTH * HT_MATCHFINDER_MIN_MATCH_LEN >=
+		      MIN_BLOCK_LENGTH);
+#if SUPPORT_NEAR_OPTIMAL_PARSING
+	/* The match cache must hold the worst case for a minimum-length block. */
+	STATIC_ASSERT(MIN_BLOCK_LENGTH * MAX_MATCHES_PER_POS <=
+		      MATCH_CACHE_LENGTH);
+#endif
+
+	/* The definition of MAX_BLOCK_LENGTH assumes this. */
+	STATIC_ASSERT(FAST_SOFT_MAX_BLOCK_LENGTH <= SOFT_MAX_BLOCK_LENGTH);
+
+	/* Verify that the sequence stores aren't uselessly large. */
+	STATIC_ASSERT(SEQ_STORE_LENGTH * DEFLATE_MIN_MATCH_LEN <=
+		      SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH);
+	STATIC_ASSERT(FAST_SEQ_STORE_LENGTH * HT_MATCHFINDER_MIN_MATCH_LEN <=
+		      FAST_SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH);
+
+	/* Verify that the maximum codeword lengths are valid. */
+	STATIC_ASSERT(
+		MAX_LITLEN_CODEWORD_LEN <= DEFLATE_MAX_LITLEN_CODEWORD_LEN);
+	STATIC_ASSERT(
+		MAX_OFFSET_CODEWORD_LEN <= DEFLATE_MAX_OFFSET_CODEWORD_LEN);
+	STATIC_ASSERT(
+		MAX_PRE_CODEWORD_LEN <= DEFLATE_MAX_PRE_CODEWORD_LEN);
+	/* Each code must be able to give every symbol a codeword. */
+	STATIC_ASSERT(
+		(1U << MAX_LITLEN_CODEWORD_LEN) >= DEFLATE_NUM_LITLEN_SYMS);
+	STATIC_ASSERT(
+		(1U << MAX_OFFSET_CODEWORD_LEN) >= DEFLATE_NUM_OFFSET_SYMS);
+	STATIC_ASSERT(
+		(1U << MAX_PRE_CODEWORD_LEN) >= DEFLATE_NUM_PRECODE_SYMS);
+}
+
+/******************************************************************************/
+
+/*
+ * Table: length slot => length slot base value.  These are the base match
+ * lengths of the DEFLATE length codes (cf. RFC 1951, section 3.2.5); the
+ * actual length is the base plus the extra length bits.
+ */
+static const unsigned deflate_length_slot_base[] = {
+	3, 4, 5, 6, 7, 8, 9, 10,
+	11, 13, 15, 17, 19, 23, 27, 31,
+	35, 43, 51, 59, 67, 83, 99, 115,
+	131, 163, 195, 227, 258,
+};
+
+/*
+ * Table: length slot => number of extra length bits.  Indexed in parallel
+ * with deflate_length_slot_base[] above (cf. RFC 1951, section 3.2.5).
+ */
+static const u8 deflate_extra_length_bits[] = {
+	0, 0, 0, 0, 0, 0, 0, 0,
+	1, 1, 1, 1, 2, 2, 2, 2,
+	3, 3, 3, 3, 4, 4, 4, 4,
+	5, 5, 5, 5, 0,
+};
+
+/*
+ * Table: offset slot => offset slot base value.  These are the base match
+ * offsets of the DEFLATE offset codes (cf. RFC 1951, section 3.2.5); the
+ * actual offset is the base plus the extra offset bits.
+ */
+static const unsigned deflate_offset_slot_base[] = {
+	1, 2, 3, 4, 5, 7, 9, 13,
+	17, 25, 33, 49, 65, 97, 129, 193,
+	257, 385, 513, 769, 1025, 1537, 2049, 3073,
+	4097, 6145, 8193, 12289, 16385, 24577,
+};
+
+/*
+ * Table: offset slot => number of extra offset bits.  Indexed in parallel
+ * with deflate_offset_slot_base[] above (cf. RFC 1951, section 3.2.5).
+ */
+static const u8 deflate_extra_offset_bits[] = {
+	0, 0, 0, 0, 1, 1, 2, 2,
+	3, 3, 4, 4, 5, 5, 6, 6,
+	7, 7, 8, 8, 9, 9, 10, 10,
+	11, 11, 12, 12, 13, 13,
+};
+
+/*
+ * Table: length => length slot.  Maps a match length directly to its length
+ * slot.  Entries for lengths below DEFLATE_MIN_MATCH_LEN (indices 0-2) are
+ * unused placeholders.
+ */
+static const u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = {
+	0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12,
+	12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16,
+	16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18,
+	18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+	20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+	21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+	22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
+	23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+	24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25,
+	25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+	25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26,
+	26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+	26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+	27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+	27, 27, 28,
+};
+
+/*
+ * A condensed table which maps offset => offset slot as follows:
+ *
+ *	offset <= 256: deflate_offset_slot[offset]
+ *	offset > 256: deflate_offset_slot[256 + ((offset - 1) >> 7)]
+ *
+ * The second half (indices 257..511) covers offsets 257..32768 at a
+ * granularity of 128; the 0 at index 257 is an unused filler, since offsets
+ * that would map there are <= 256 and use the first half instead.
+ *
+ * This table was generated by scripts/gen_offset_slot_map.py.
+ */
+static const u8 deflate_offset_slot[512] = {
+	0, 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7,
+	7, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9,
+	9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+	10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+	11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+	12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+	12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+	13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+	13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+	14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+	15, 0, 16, 17, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21,
+	22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
+	24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+	25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+	26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+	26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+	27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+	27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+	28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+	28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+	28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+	28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+	29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+	29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+	29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+	29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+};
+
+/*
+ * The order in which precode codeword lengths are stored in the output
+ * stream (cf. RFC 1951, section 3.2.7): most commonly used symbols first,
+ * so that trailing zero lengths can be omitted.
+ */
+static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
+	16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
+};
+
+/*
+ * Table: precode symbol => number of extra bits.  Only the repeat symbols
+ * 16, 17, and 18 take extra bits (2, 3, and 7 respectively).
+ */
+static const u8 deflate_extra_precode_bits[DEFLATE_NUM_PRECODE_SYMS] = {
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 7
+};
+
+/*
+ * Codewords for the DEFLATE Huffman codes, indexed by symbol.  Each
+ * codeword is stored in the low bits of a u32.
+ */
+struct deflate_codewords {
+	u32 litlen[DEFLATE_NUM_LITLEN_SYMS];
+	u32 offset[DEFLATE_NUM_OFFSET_SYMS];
+};
+
+/*
+ * Codeword lengths (in bits) for the DEFLATE Huffman codes, indexed by
+ * symbol.  A zero length means the corresponding symbol had zero frequency
+ * and therefore has no codeword.
+ */
+struct deflate_lens {
+	u8 litlen[DEFLATE_NUM_LITLEN_SYMS];
+	u8 offset[DEFLATE_NUM_OFFSET_SYMS];
+};
+
+/*
+ * Codewords and lengths for the DEFLATE Huffman codes.  Together these
+ * fully describe a litlen code and an offset code.
+ */
+struct deflate_codes {
+	struct deflate_codewords codewords;
+	struct deflate_lens lens;
+};
+
+/*
+ * Symbol frequency counters for the DEFLATE Huffman codes, indexed by
+ * symbol.  These are the inputs to Huffman code construction.
+ */
+struct deflate_freqs {
+	u32 litlen[DEFLATE_NUM_LITLEN_SYMS];
+	u32 offset[DEFLATE_NUM_OFFSET_SYMS];
+};
+
+/*
+ * Represents a run of literals followed by a match or end-of-block.  This
+ * struct is needed to temporarily store items chosen by the parser, since items
+ * cannot be written until all items for the block have been chosen and the
+ * block's Huffman codes have been computed.
+ */
+struct deflate_sequence {
+
+	/*
+	 * Bits 0..22: the number of literals in this run.  This may be 0 and
+	 * can be at most MAX_BLOCK_LENGTH.  The literals are not stored
+	 * explicitly in this structure; instead, they are read directly from
+	 * the uncompressed data.
+	 *
+	 * Bits 23..31: the length of the match which follows the literals, or 0
+	 * if this literal run was the last in the block, so there is no match
+	 * which follows it.
+	 */
+#define SEQ_LENGTH_SHIFT 23
+#define SEQ_LITRUNLEN_MASK (((u32)1 << SEQ_LENGTH_SHIFT) - 1)
+	u32 litrunlen_and_length;
+
+	/*
+	 * If 'length' doesn't indicate end-of-block, then this is the offset of
+	 * the match which follows the literals.  (Not meaningful otherwise.)
+	 */
+	u16 offset;
+
+	/*
+	 * If 'length' doesn't indicate end-of-block, then this is the offset
+	 * slot of the match which follows the literals.  (Not meaningful
+	 * otherwise.)
+	 */
+	u16 offset_slot;
+};
+
+#if SUPPORT_NEAR_OPTIMAL_PARSING
+
+/*
+ * Costs for the near-optimal parsing algorithm.  Each cost is the estimated
+ * number of bits needed to output the item; presumably these are in units of
+ * 1/BIT_COST bits to allow fractional costs (see BIT_COST above) -- confirm
+ * against the code that fills these in.
+ */
+struct deflate_costs {
+
+	/* The cost to output each possible literal */
+	u32 literal[DEFLATE_NUM_LITERALS];
+
+	/* The cost to output each possible match length */
+	u32 length[DEFLATE_MAX_MATCH_LEN + 1];
+
+	/* The cost to output a match offset of each possible offset slot */
+	u32 offset_slot[DEFLATE_NUM_OFFSET_SYMS];
+};
+
+/*
+ * This structure represents a byte position in the input data and a node in the
+ * graph of possible match/literal choices for the current block.
+ *
+ * Logically, each incoming edge to this node is labeled with a literal or a
+ * match that can be taken to reach this position from an earlier position; and
+ * each outgoing edge from this node is labeled with a literal or a match that
+ * can be taken to advance from this position to a later position.
+ *
+ * But these "edges" are actually stored elsewhere (in 'match_cache').  Here we
+ * associate with each node just two pieces of information:
+ *
+ *	'cost_to_end' is the minimum cost to reach the end of the block from
+ *	this position.
+ *
+ *	'item' represents the literal or match that must be chosen from here to
+ *	reach the end of the block with the minimum cost.  Equivalently, this
+ *	can be interpreted as the label of the outgoing edge on the minimum-cost
+ *	path to the "end of block" node from this node.
+ */
+struct deflate_optimum_node {
+
+	/* Minimum cost to reach the end of the block from this position */
+	u32 cost_to_end;
+
+	/*
+	 * Notes on the match/literal representation used here:
+	 *
+	 *	The low bits of 'item' are the length: 1 if this is a literal,
+	 *	or the match length if this is a match.
+	 *
+	 *	The high bits of 'item' are the actual literal byte if this is a
+	 *	literal, or the match offset if this is a match.
+	 */
+#define OPTIMUM_OFFSET_SHIFT 9
+#define OPTIMUM_LEN_MASK (((u32)1 << OPTIMUM_OFFSET_SHIFT) - 1)
+	u32 item;
+
+};
+
+#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
+
+/*
+ * Block split statistics.  See "Block splitting algorithm" below (the
+ * algorithm itself is defined later in this file).  Observations are
+ * classified into NUM_OBSERVATION_TYPES buckets: 8 for literals and 2 for
+ * matches.
+ */
+#define NUM_LITERAL_OBSERVATION_TYPES 8
+#define NUM_MATCH_OBSERVATION_TYPES 2
+#define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + \
+			       NUM_MATCH_OBSERVATION_TYPES)
+#define NUM_OBSERVATIONS_PER_BLOCK_CHECK 512
+struct block_split_stats {
+	/* NOTE(review): presumably counts accumulated since the last check
+	 * (every NUM_OBSERVATIONS_PER_BLOCK_CHECK observations) -- confirm
+	 * against the block splitting code below. */
+	u32 new_observations[NUM_OBSERVATION_TYPES];
+	/* Totals for the block so far, per observation type */
+	u32 observations[NUM_OBSERVATION_TYPES];
+	u32 num_new_observations;
+	u32 num_observations;
+};
+
+struct deflate_output_bitstream;
+
+/* The main DEFLATE compressor structure */
+struct libdeflate_compressor {
+
+ /* Pointer to the compress() implementation chosen at allocation time */
+ void (*impl)(struct libdeflate_compressor *restrict c, const u8 *in,
+ size_t in_nbytes, struct deflate_output_bitstream *os);
+
+ /* The compression level with which this compressor was created */
+ unsigned compression_level;
+
+ /* Inputs of this size or less are passed through without compression. */
+ size_t max_passthrough_size;
+
+ /*
+ * The maximum search depth: consider at most this many potential
+ * matches at each position
+ */
+ unsigned max_search_depth;
+
+ /*
+ * The "nice" match length: if a match of this length is found, choose
+ * it immediately without further consideration
+ */
+ unsigned nice_match_length;
+
+ /* Frequency counters for the current block */
+ struct deflate_freqs freqs;
+
+ /* Block split statistics for the current block */
+ struct block_split_stats split_stats;
+
+ /* Dynamic Huffman codes for the current block */
+ struct deflate_codes codes;
+
+ /* The static Huffman codes defined by the DEFLATE format */
+ struct deflate_codes static_codes;
+
+ /* Temporary space for block flushing; the members are used at disjoint times */
+ union {
+ /* Information about the precode */
+ struct {
+ u32 freqs[DEFLATE_NUM_PRECODE_SYMS];
+ u32 codewords[DEFLATE_NUM_PRECODE_SYMS];
+ u8 lens[DEFLATE_NUM_PRECODE_SYMS];
+ unsigned items[DEFLATE_NUM_LITLEN_SYMS +
+ DEFLATE_NUM_OFFSET_SYMS];
+ unsigned num_litlen_syms;
+ unsigned num_offset_syms;
+ unsigned num_explicit_lens;
+ unsigned num_items;
+ } precode;
+ /*
+ * The "full" length codewords. Used only after the information
+ * in 'precode' is no longer needed.
+ */
+ struct {
+ u32 codewords[DEFLATE_MAX_MATCH_LEN + 1];
+ u8 lens[DEFLATE_MAX_MATCH_LEN + 1];
+ } length;
+ } o;
+
+ union {
+ /* Data for greedy or lazy parsing */
+ struct {
+ /* Hash chains matchfinder */
+ struct hc_matchfinder hc_mf;
+
+ /* Matches and literals chosen for the current block */
+ struct deflate_sequence sequences[SEQ_STORE_LENGTH + 1];
+
+ } g; /* (g)reedy */
+
+ /* Data for fastest parsing */
+ struct {
+ /* Hash table matchfinder */
+ struct ht_matchfinder ht_mf;
+
+ /* Matches and literals chosen for the current block */
+ struct deflate_sequence sequences[
+ FAST_SEQ_STORE_LENGTH + 1];
+
+ } f; /* (f)astest */
+
+ #if SUPPORT_NEAR_OPTIMAL_PARSING
+ /* Data for near-optimal parsing */
+ struct {
+
+ /* Binary tree matchfinder */
+ struct bt_matchfinder bt_mf;
+
+ /*
+ * Cached matches for the current block. This array
+ * contains the matches that were found at each position
+ * in the block. Specifically, for each position, there
+ * is a list of matches found at that position, if any,
+ * sorted by strictly increasing length. In addition,
+ * following the matches for each position, there is a
+ * special 'struct lz_match' whose 'length' member
+ * contains the number of matches found at that
+ * position, and whose 'offset' member contains the
+ * literal at that position.
+ *
+ * Note: in rare cases, there will be a very high number
+ * of matches in the block and this array will overflow.
+ * If this happens, we force the end of the current
+ * block. MATCH_CACHE_LENGTH is the length at which we
+ * actually check for overflow. The extra slots beyond
+ * this are enough to absorb the worst case overflow,
+ * which occurs if starting at
+ * &match_cache[MATCH_CACHE_LENGTH - 1], we write
+ * MAX_MATCHES_PER_POS matches and a match count header,
+ * then skip searching for matches at
+ * 'DEFLATE_MAX_MATCH_LEN - 1' positions and write the
+ * match count header for each.
+ */
+ struct lz_match match_cache[MATCH_CACHE_LENGTH +
+ MAX_MATCHES_PER_POS +
+ DEFLATE_MAX_MATCH_LEN - 1];
+
+ /*
+ * Array of nodes, one per position, for running the
+ * minimum-cost path algorithm.
+ *
+ * This array must be large enough to accommodate the
+ * worst-case number of nodes, which is MAX_BLOCK_LENGTH
+ * plus 1 for the end-of-block node.
+ */
+ struct deflate_optimum_node optimum_nodes[
+ MAX_BLOCK_LENGTH + 1];
+
+ /* The current cost model being used */
+ struct deflate_costs costs;
+
+ /*
+ * A table that maps match offset to offset slot. This
+ * differs from deflate_offset_slot[] in that this is a
+ * full map, not a condensed one. The full map is more
+ * appropriate for the near-optimal parser, since the
+ * near-optimal parser does more offset => offset_slot
+ * translations, it doesn't intersperse them with
+ * matchfinding (so cache evictions are less of a
+ * concern), and it uses more memory anyway.
+ */
+ u8 offset_slot_full[DEFLATE_MAX_MATCH_OFFSET + 1];
+
+ /* Literal/match statistics saved from previous block */
+ u32 prev_observations[NUM_OBSERVATION_TYPES];
+ u32 prev_num_observations;
+
+ /*
+ * Approximate match length frequencies based on a
+ * greedy parse, gathered during matchfinding. This is
+ * used for setting the initial symbol costs.
+ */
+ u32 new_match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1];
+ u32 match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1];
+
+ unsigned num_optim_passes; /* number of optimization passes to run */
+ } n; /* (n)ear-optimal */
+ #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
+
+ } p; /* (p)arser */
+};
+
+/*
+ * The type for the bitbuffer variable, which temporarily holds bits that are
+ * being packed into bytes and written to the output buffer. For best
+ * performance, this should have size equal to a machine word.
+ */
+typedef machine_word_t bitbuf_t;
+
+/*
+ * Bitbuffer capacity in bits: 1 less than the true size, so that
+ * 'bitbuf >>= bitcount & ~7' can never shift by the full word width (UB in C).
+ */
+#define BITBUF_NBITS (8 * sizeof(bitbuf_t) - 1)
+
+/*
+ * Can the specified number of bits always be added to 'bitbuf' after any
+ * pending bytes have been flushed? There can be up to 7 bits remaining after a
+ * flush, so the count must not exceed BITBUF_NBITS after adding 'n' more bits.
+ */
+#define CAN_BUFFER(n) (7 + (n) <= BITBUF_NBITS)
+
+/*
+ * Structure to keep track of the current state of sending bits to the
+ * compressed output buffer
+ */
+struct deflate_output_bitstream {
+
+ /* Bits not yet flushed to the output buffer (low-order bits are oldest) */
+ bitbuf_t bitbuf;
+
+ /*
+ * Number of bits currently held in @bitbuf. This can be between 0 and
+ * BITBUF_NBITS in general, or between 0 and 7 after a flush.
+ */
+ unsigned bitcount;
+
+ /*
+ * Pointer to the position in the output buffer at which the next byte
+ * should be written
+ */
+ u8 *next;
+
+ /*
+ * Pointer to near the end of the output buffer. 'next' will never
+ * exceed this. There are OUTPUT_END_PADDING bytes reserved after this
+ * to allow branchlessly writing a whole word at this location.
+ */
+ u8 *end;
+};
+
+/*
+ * OUTPUT_END_PADDING is the size, in bytes, of the extra space that must be
+ * present following os->end, in order to not overrun the buffer when generating
+ * output. When UNALIGNED_ACCESS_IS_FAST, we need at least sizeof(bitbuf_t)
+ * bytes for put_unaligned_leword(). Otherwise we need only 1 byte. However,
+ * to make the compression algorithm produce the same result on all CPU
+ * architectures (which is sometimes desirable), we have to unconditionally use
+ * the maximum for any CPU, which is sizeof(bitbuf_t) == 8.
+ */
+#define OUTPUT_END_PADDING 8 /* max sizeof(bitbuf_t) across supported CPUs */
+
+/*
+ * Add some bits to the bitbuffer variable of the output bitstream. The caller
+ * must ensure that 'bitcount + n <= BITBUF_NBITS', by calling FLUSH_BITS()
+ * frequently enough.
+ */
+#define ADD_BITS(bits, n) \
+do { \
+ bitbuf |= (bitbuf_t)(bits) << bitcount; /* new bits above existing ones */ \
+ bitcount += (n); \
+ ASSERT(bitcount <= BITBUF_NBITS); \
+} while (0)
+
+/* Flush bits from the bitbuffer variable to the output buffer. */
+#define FLUSH_BITS() \
+do { \
+ if (UNALIGNED_ACCESS_IS_FAST) { \
+ /* Flush a whole word (branchlessly). */ \
+ put_unaligned_leword(bitbuf, out_next); \
+ bitbuf >>= bitcount & ~7; /* drop only whole flushed bytes */ \
+ out_next += MIN(out_end - out_next, bitcount >> 3); /* clamp at end */ \
+ bitcount &= 7; \
+ } else { \
+ /* Flush a byte at a time. */ \
+ while (bitcount >= 8) { \
+ *out_next = bitbuf; \
+ if (out_next != out_end) \
+ out_next++; \
+ bitcount -= 8; \
+ bitbuf >>= 8; \
+ } \
+ } \
+} while (0)
+
+/*
+ * Sift A[subtree_idx] down through the 1-indexed maxheap A[1..length]
+ * until neither child is greater, restoring the maxheap property for the
+ * subtree rooted there. Both child subtrees must already be valid
+ * maxheaps on entry.
+ */
+static void
+heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx)
+{
+	const u32 sifted = A[subtree_idx];
+	unsigned hole = subtree_idx;
+	unsigned kid;
+
+	for (kid = hole * 2; kid <= length; kid = hole * 2) {
+		/* Pick the greater of the (up to) two children. */
+		if (kid + 1 <= length && A[kid + 1] > A[kid])
+			kid++;
+		if (A[kid] <= sifted)
+			break;
+		/* Pull the greater child up; the hole moves down. */
+		A[hole] = A[kid];
+		hole = kid;
+	}
+	A[hole] = sifted;
+}
+
+/*
+ * Build a maxheap over the 1-indexed array A[1..length] by sifting down
+ * every internal node from last to first (the leaves need no work).
+ */
+static void
+heapify_array(u32 A[], unsigned length)
+{
+	unsigned node;
+
+	for (node = length / 2; node != 0; node--)
+		heapify_subtree(A, length, node);
+}
+
+/*
+ * Sort 'length' u32 values into ascending order with heapsort: build a
+ * maxheap, then repeatedly move the maximum to the end of the shrinking
+ * heap. Named heap_sort() rather than heapsort() to avoid clashing with
+ * the heapsort() that stdlib.h declares on some BSD-derived systems ---
+ * though compiling with -D_ANSI_SOURCE would also avoid that clash.
+ */
+static void
+heap_sort(u32 A[], unsigned length)
+{
+	A--; /* Shift so the heap code can use 1-based indexing. */
+
+	heapify_array(A, length);
+
+	for (; length >= 2; length--) {
+		/* Move the current maximum into its final position. */
+		const u32 cur_max = A[1];
+
+		A[1] = A[length];
+		A[length] = cur_max;
+		heapify_subtree(A, length - 1, 1);
+	}
+}
+
+#define NUM_SYMBOL_BITS 10 /* bits per symbol in packed (freq << 10 | sym) entries */
+#define NUM_FREQ_BITS (32 - NUM_SYMBOL_BITS)
+#define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1)
+#define FREQ_MASK (~SYMBOL_MASK)
+
+#define GET_NUM_COUNTERS(num_syms) (num_syms) /* ~1 counter/symbol is fastest; see sort_symbols() */
+
+/*
+ * Sort the symbols primarily by frequency and secondarily by symbol value.
+ * Discard symbols with zero frequency and fill in an array with the remaining
+ * symbols, along with their frequencies. The low NUM_SYMBOL_BITS bits of each
+ * array entry will contain the symbol value, and the remaining bits will
+ * contain the frequency.
+ *
+ * @num_syms
+ * Number of symbols in the alphabet, at most 1 << NUM_SYMBOL_BITS.
+ *
+ * @freqs[num_syms]
+ * Frequency of each symbol, summing to at most (1 << NUM_FREQ_BITS) - 1.
+ *
+ * @lens[num_syms]
+ * An array that eventually will hold the length of each codeword. This
+ * function only fills in the codeword lengths for symbols that have zero
+ * frequency, which are not well defined per se but will be set to 0.
+ *
+ * @symout[num_syms]
+ * The output array, described above.
+ *
+ * Returns the number of entries in 'symout' that were filled. This is the
+ * number of symbols that have nonzero frequency.
+ */
+static unsigned
+sort_symbols(unsigned num_syms, const u32 freqs[], u8 lens[], u32 symout[])
+{
+ unsigned sym;
+ unsigned i;
+ unsigned num_used_syms;
+ unsigned num_counters;
+ unsigned counters[GET_NUM_COUNTERS(DEFLATE_MAX_NUM_SYMS)];
+
+ /*
+ * We use heapsort, but with an added optimization. Since often most
+ * symbol frequencies are low, we first do a count sort using a limited
+ * number of counters. High frequencies are counted in the last
+ * counter, and only they will be sorted with heapsort.
+ *
+ * Note: with more symbols, it is generally beneficial to have more
+ * counters. About 1 counter per symbol seems fastest.
+ */
+
+ num_counters = GET_NUM_COUNTERS(num_syms);
+
+ memset(counters, 0, num_counters * sizeof(counters[0]));
+
+ /* Tally each frequency; all high frequencies share the last counter. */
+ for (sym = 0; sym < num_syms; sym++)
+ counters[MIN(freqs[sym], num_counters - 1)]++;
+
+ /*
+ * Make the counters cumulative, ignoring the zero-th, which counted
+ * symbols with zero frequency. As a side effect, this calculates the
+ * number of symbols with nonzero frequency.
+ */
+ num_used_syms = 0;
+ for (i = 1; i < num_counters; i++) {
+ unsigned count = counters[i];
+
+ counters[i] = num_used_syms;
+ num_used_syms += count;
+ }
+
+ /*
+ * Sort nonzero-frequency symbols using the counters. At the same time,
+ * set the codeword lengths of zero-frequency symbols to 0.
+ */
+ for (sym = 0; sym < num_syms; sym++) {
+ u32 freq = freqs[sym];
+
+ if (freq != 0) {
+ symout[counters[MIN(freq, num_counters - 1)]++] =
+ sym | (freq << NUM_SYMBOL_BITS);
+ } else {
+ lens[sym] = 0;
+ }
+ }
+
+ /* Heapsort only the high-frequency symbols, counted in the last counter. */
+ heap_sort(symout + counters[num_counters - 2],
+ counters[num_counters - 1] - counters[num_counters - 2]);
+
+ return num_used_syms;
+}
+
+/*
+ * Build a Huffman tree.
+ *
+ * This is an optimized implementation that
+ * (a) takes advantage of the frequencies being already sorted;
+ * (b) only generates non-leaf nodes, since the non-leaf nodes of a Huffman
+ * tree are sufficient to generate a canonical code;
+ * (c) Only stores parent pointers, not child pointers;
+ * (d) Produces the nodes in the same memory used for input frequency
+ * information.
+ *
+ * Array 'A', which contains 'sym_count' entries, is used for both input and
+ * output. For this function, 'sym_count' must be at least 2.
+ *
+ * For input, the array must contain the frequencies of the symbols, sorted in
+ * increasing order. Specifically, each entry must contain a frequency left
+ * shifted by NUM_SYMBOL_BITS bits. Any data in the low NUM_SYMBOL_BITS bits of
+ * the entries will be ignored by this function. Although these bits will, in
+ * fact, contain the symbols that correspond to the frequencies, this function
+ * is concerned with frequencies only and keeps the symbols as-is.
+ *
+ * For output, this function will produce the non-leaf nodes of the Huffman
+ * tree. These nodes will be stored in the first (sym_count - 1) entries of the
+ * array. Entry A[sym_count - 2] will represent the root node. Each other node
+ * will contain the zero-based index of its parent node in 'A', left shifted by
+ * NUM_SYMBOL_BITS bits. The low NUM_SYMBOL_BITS bits of each entry in A will
+ * be kept as-is. Again, note that although these low bits will, in fact,
+ * contain a symbol value, this symbol will have *no relationship* with the
+ * Huffman tree node that happens to occupy the same slot. This is because this
+ * implementation only generates the non-leaf nodes of the tree.
+ */
+static void
+build_tree(u32 A[], unsigned sym_count)
+{
+ const unsigned last_idx = sym_count - 1;
+
+ /* Index of the next lowest frequency leaf that still needs a parent */
+ unsigned i = 0;
+
+ /*
+ * Index of the next lowest frequency non-leaf that still needs a
+ * parent, or 'e' if there is currently no such node
+ */
+ unsigned b = 0;
+
+ /* Index of the next spot for a non-leaf (will overwrite a leaf) */
+ unsigned e = 0;
+
+ do {
+ u32 new_freq;
+
+ /*
+ * Select the next two lowest frequency nodes among the leaves
+ * A[i] and non-leaves A[b], and create a new node A[e] to be
+ * their parent. Set the new node's frequency to the sum of the
+ * frequencies of its two children.
+ *
+ * Usually the next two lowest frequency nodes are of the same
+ * type (leaf or non-leaf), so check those cases first.
+ */
+ if (i + 1 <= last_idx &&
+ (b == e || (A[i + 1] & FREQ_MASK) <= (A[b] & FREQ_MASK))) {
+ /* Two leaves */
+ new_freq = (A[i] & FREQ_MASK) + (A[i + 1] & FREQ_MASK);
+ i += 2;
+ } else if (b + 2 <= e &&
+ (i > last_idx ||
+ (A[b + 1] & FREQ_MASK) < (A[i] & FREQ_MASK))) {
+ /* Two non-leaves */
+ new_freq = (A[b] & FREQ_MASK) + (A[b + 1] & FREQ_MASK);
+ A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK);
+ A[b + 1] = (e << NUM_SYMBOL_BITS) |
+ (A[b + 1] & SYMBOL_MASK);
+ b += 2;
+ } else {
+ /* One leaf and one non-leaf */
+ new_freq = (A[i] & FREQ_MASK) + (A[b] & FREQ_MASK);
+ A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK);
+ i++;
+ b++;
+ }
+ A[e] = new_freq | (A[e] & SYMBOL_MASK); /* keep symbol bits intact */
+ /*
+ * A binary tree with 'n' leaves has 'n - 1' non-leaves, so the
+ * tree is complete once we've created 'n - 1' non-leaves.
+ */
+ } while (++e < last_idx);
+}
+
+/*
+ * Given the stripped-down Huffman tree constructed by build_tree(), determine
+ * the number of codewords that should be assigned each possible length, taking
+ * into account the length-limited constraint.
+ *
+ * @A
+ * The array produced by build_tree(), containing parent index information
+ * for the non-leaf nodes of the Huffman tree. Each entry in this array is
+ * a node; a node's parent always has a greater index than that node
+ * itself. This function will overwrite the parent index information in
+ * this array, so essentially it will destroy the tree. However, the data
+ * in the low NUM_SYMBOL_BITS of each entry will be preserved.
+ *
+ * @root_idx
+ * The 0-based index of the root node in 'A', and consequently one less
+ * than the number of tree node entries in 'A'. (Or, really 2 less than
+ * the actual length of 'A'.)
+ *
+ * @len_counts
+ * An array of length ('max_codeword_len' + 1) in which the number of
+ * codewords having each length <= max_codeword_len will be returned.
+ *
+ * @max_codeword_len
+ * The maximum permissible codeword length.
+ */
+static void
+compute_length_counts(u32 A[], unsigned root_idx, unsigned len_counts[],
+ unsigned max_codeword_len)
+{
+ unsigned len;
+ int node;
+
+ /*
+ * The key observations are:
+ *
+ * (1) We can traverse the non-leaf nodes of the tree, always visiting a
+ * parent before its children, by simply iterating through the array
+ * in reverse order. Consequently, we can compute the depth of each
+ * node in one pass, overwriting the parent indices with depths.
+ *
+ * (2) We can initially assume that in the real Huffman tree, both
+ * children of the root are leaves. This corresponds to two
+ * codewords of length 1. Then, whenever we visit a (non-leaf) node
+ * during the traversal, we modify this assumption to account for
+ * the current node *not* being a leaf, but rather its two children
+ * being leaves. This causes the loss of one codeword for the
+ * current depth and the addition of two codewords for the current
+ * depth plus one.
+ *
+ * (3) We can handle the length-limited constraint fairly easily by
+ * simply using the largest length available when a depth exceeds
+ * max_codeword_len.
+ */
+
+ for (len = 0; len <= max_codeword_len; len++)
+ len_counts[len] = 0;
+ len_counts[1] = 2; /* root's two children, assumed leaves for now */
+
+ /* Set the root node's depth to 0. */
+ A[root_idx] &= SYMBOL_MASK;
+
+ for (node = root_idx - 1; node >= 0; node--) {
+
+ /* Calculate the depth of this node. */
+
+ unsigned parent = A[node] >> NUM_SYMBOL_BITS;
+ unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS;
+ unsigned depth = parent_depth + 1;
+
+ /*
+ * Set the depth of this node so that it is available when its
+ * children (if any) are processed.
+ */
+ A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS);
+
+ /*
+ * If needed, decrease the length to meet the length-limited
+ * constraint. This is not the optimal method for generating
+ * length-limited Huffman codes! But it should be good enough.
+ */
+ if (depth >= max_codeword_len) {
+ depth = max_codeword_len;
+ do {
+ depth--;
+ } while (len_counts[depth] == 0);
+ }
+
+ /*
+ * Account for the fact that we have a non-leaf node at the
+ * current depth.
+ */
+ len_counts[depth]--;
+ len_counts[depth + 1] += 2;
+ }
+}
+
+/*
+ * DEFLATE uses bit-reversed codewords, so we must bit-reverse the codewords
+ * after generating them. All codewords have length <= 16 bits. If the CPU has
+ * a bit-reversal instruction, then that is the fastest method. Otherwise the
+ * fastest method is to reverse the bits in each of the two bytes using a table.
+ * The table method is slightly faster than using bitwise operations to flip
+ * adjacent 1, 2, 4, and then 8-bit fields, even if 2 to 4 codewords are packed
+ * into a machine word and processed together using that method.
+ */
+
+#ifdef rbit32
+static forceinline u32 reverse_codeword(u32 codeword, u8 len)
+{
+ return rbit32(codeword) >> ((32 - len) & 31); /* '& 31' keeps the shift in range even if len == 0 */
+}
+#else
+/* Generated by scripts/gen_bitreverse_tab.py */
+static const u8 bitreverse_tab[256] = {
+ 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0,
+ 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0,
+ 0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8,
+ 0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8,
+ 0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4,
+ 0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4,
+ 0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec,
+ 0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc,
+ 0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2,
+ 0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2,
+ 0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea,
+ 0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa,
+ 0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6,
+ 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6,
+ 0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee,
+ 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe,
+ 0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1,
+ 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1,
+ 0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9,
+ 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9,
+ 0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5,
+ 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5,
+ 0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed,
+ 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd,
+ 0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3,
+ 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3,
+ 0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb,
+ 0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb,
+ 0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7,
+ 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7,
+ 0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef,
+ 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff,
+};
+
+static forceinline u32 reverse_codeword(u32 codeword, u8 len)
+{
+ STATIC_ASSERT(DEFLATE_MAX_CODEWORD_LEN <= 16);
+ codeword = ((u32)bitreverse_tab[codeword & 0xff] << 8) |
+ bitreverse_tab[codeword >> 8];
+ return codeword >> (16 - len);
+}
+#endif /* !rbit32 */
+
+/*
+ * Generate the codewords for a canonical Huffman code.
+ *
+ * @A
+ * The output array for codewords. In addition, initially this
+ * array must contain the symbols, sorted primarily by frequency and
+ * secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of
+ * each entry.
+ *
+ * @lens
+ * Output array for codeword lengths.
+ *
+ * @len_counts
+ * An array that provides the number of codewords that will have
+ * each possible length <= max_codeword_len.
+ *
+ * @max_codeword_len
+ * Maximum length, in bits, of each codeword.
+ *
+ * @num_syms
+ * Number of symbols in the alphabet, including symbols with zero
+ * frequency. This is the length of the 'A' and 'len' arrays.
+ */
+static void
+gen_codewords(u32 A[], u8 lens[], const unsigned len_counts[],
+ unsigned max_codeword_len, unsigned num_syms)
+{
+ u32 next_codewords[DEFLATE_MAX_CODEWORD_LEN + 1];
+ unsigned i;
+ unsigned len;
+ unsigned sym;
+
+ /*
+ * Given the number of codewords that will have each length, assign
+ * codeword lengths to symbols. We do this by assigning the lengths in
+ * decreasing order to the symbols sorted primarily by increasing
+ * frequency and secondarily by increasing symbol value.
+ */
+ for (i = 0, len = max_codeword_len; len >= 1; len--) {
+ unsigned count = len_counts[len];
+
+ while (count--)
+ lens[A[i++] & SYMBOL_MASK] = len;
+ }
+
+ /*
+ * Generate the codewords themselves. We initialize the
+ * 'next_codewords' array to provide the lexicographically first
+ * codeword of each length, then assign codewords in symbol order. This
+ * produces a canonical code.
+ */
+ next_codewords[0] = 0; /* slot 0 harmlessly absorbs zero-frequency symbols */
+ next_codewords[1] = 0;
+ for (len = 2; len <= max_codeword_len; len++)
+ next_codewords[len] =
+ (next_codewords[len - 1] + len_counts[len - 1]) << 1;
+
+ for (sym = 0; sym < num_syms; sym++) {
+ /* DEFLATE requires bit-reversed codewords. */
+ A[sym] = reverse_codeword(next_codewords[lens[sym]]++,
+ lens[sym]);
+ }
+}
+
+/*
+ * ---------------------------------------------------------------------
+ * deflate_make_huffman_code()
+ * ---------------------------------------------------------------------
+ *
+ * Given an alphabet and the frequency of each symbol in it, construct a
+ * length-limited canonical Huffman code.
+ *
+ * @num_syms
+ * The number of symbols in the alphabet. The symbols are the integers in
+ * the range [0, num_syms - 1]. This parameter must be at least 2 and
+ * must not exceed (1 << NUM_SYMBOL_BITS).
+ *
+ * @max_codeword_len
+ * The maximum permissible codeword length.
+ *
+ * @freqs
+ * An array of length @num_syms that gives the frequency of each symbol.
+ * It is valid for some, none, or all of the frequencies to be 0. The sum
+ * of frequencies must not exceed (1 << NUM_FREQ_BITS) - 1.
+ *
+ * @lens
+ * An array of @num_syms entries in which this function will return the
+ * length, in bits, of the codeword assigned to each symbol. Symbols with
+ * 0 frequency will not have codewords per se, but their entries in this
+ * array will be set to 0. No lengths greater than @max_codeword_len will
+ * be assigned.
+ *
+ * @codewords
+ * An array of @num_syms entries in which this function will return the
+ * codeword for each symbol, right-justified and padded on the left with
+ * zeroes. Codewords for symbols with 0 frequency will be undefined.
+ *
+ * ---------------------------------------------------------------------
+ *
+ * This function builds a length-limited canonical Huffman code.
+ *
+ * A length-limited Huffman code contains no codewords longer than some
+ * specified length, and has exactly (with some algorithms) or approximately
+ * (with the algorithm used here) the minimum weighted path length from the
+ * root, given this constraint.
+ *
+ * A canonical Huffman code satisfies the properties that a longer codeword
+ * never lexicographically precedes a shorter codeword, and the lexicographic
+ * ordering of codewords of the same length is the same as the lexicographic
+ * ordering of the corresponding symbols. A canonical Huffman code, or more
+ * generally a canonical prefix code, can be reconstructed from only a list
+ * containing the codeword length of each symbol.
+ *
+ * The classic algorithm to generate a Huffman code creates a node for each
+ * symbol, then inserts these nodes into a min-heap keyed by symbol frequency.
+ * Then, repeatedly, the two lowest-frequency nodes are removed from the
+ * min-heap and added as the children of a new node having frequency equal to
+ * the sum of its two children, which is then inserted into the min-heap. When
+ * only a single node remains in the min-heap, it is the root of the Huffman
+ * tree. The codeword for each symbol is determined by the path needed to reach
+ * the corresponding node from the root. Descending to the left child appends a
+ * 0 bit, whereas descending to the right child appends a 1 bit.
+ *
+ * The classic algorithm is relatively easy to understand, but it is subject to
+ * a number of inefficiencies. In practice, it is fastest to first sort the
+ * symbols by frequency. (This itself can be subject to an optimization based
+ * on the fact that most frequencies tend to be low.) At the same time, we sort
+ * secondarily by symbol value, which aids the process of generating a canonical
+ * code. Then, during tree construction, no heap is necessary because both the
+ * leaf nodes and the unparented non-leaf nodes can be easily maintained in
+ * sorted order. Consequently, there can never be more than two possibilities
+ * for the next-lowest-frequency node.
+ *
+ * In addition, because we're generating a canonical code, we actually don't
+ * need the leaf nodes of the tree at all, only the non-leaf nodes. This is
+ * because for canonical code generation we don't need to know where the symbols
+ * are in the tree. Rather, we only need to know how many leaf nodes have each
+ * depth (codeword length). And this information can, in fact, be quickly
+ * generated from the tree of non-leaves only.
+ *
+ * Furthermore, we can build this stripped-down Huffman tree directly in the
+ * array in which the codewords are to be generated, provided that these array
+ * slots are large enough to hold a symbol and frequency value.
+ *
+ * Still furthermore, we don't even need to maintain explicit child pointers.
+ * We only need the parent pointers, and even those can be overwritten in-place
+ * with depth information as part of the process of extracting codeword lengths
+ * from the tree. So in summary, we do NOT need a big structure like:
+ *
+ * struct huffman_tree_node {
+ * unsigned int symbol;
+ * unsigned int frequency;
+ * unsigned int depth;
+ * struct huffman_tree_node *left_child;
+ * struct huffman_tree_node *right_child;
+ * };
+ *
+ *
+ * ... which often gets used in "naive" implementations of Huffman code
+ * generation.
+ *
+ * Many of these optimizations are based on the implementation in 7-Zip (source
+ * file: C/HuffEnc.c), which was placed in the public domain by Igor Pavlov.
+ */
+static void
+deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len,
+ const u32 freqs[], u8 lens[], u32 codewords[])
+{
+ u32 *A = codewords;
+ unsigned num_used_syms;
+
+ STATIC_ASSERT(DEFLATE_MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS);
+ STATIC_ASSERT(MAX_BLOCK_LENGTH <= ((u32)1 << NUM_FREQ_BITS) - 1);
+
+ /*
+ * We begin by sorting the symbols primarily by frequency and
+ * secondarily by symbol value. As an optimization, the array used for
+ * this purpose ('A') shares storage with the space in which we will
+ * eventually return the codewords.
+ */
+ num_used_syms = sort_symbols(num_syms, freqs, lens, A);
+
+ /*
+ * 'num_used_syms' is the number of symbols with nonzero frequency.
+ * This may be less than @num_syms. 'num_used_syms' is also the number
+ * of entries in 'A' that are valid. Each entry consists of a distinct
+ * symbol and a nonzero frequency packed into a 32-bit integer.
+ */
+
+ /*
+ * Handle special cases where only 0 or 1 symbols were used (had nonzero
+ * frequency).
+ */
+
+ if (unlikely(num_used_syms == 0)) {
+ /*
+ * Code is empty. sort_symbols() already set all lengths to 0,
+ * so there is nothing more to do.
+ */
+ return;
+ }
+
+ if (unlikely(num_used_syms == 1)) {
+ /*
+ * Only one symbol was used, so we only need one codeword. But
+ * two codewords are needed to form the smallest complete
+ * Huffman code, which uses codewords 0 and 1. Therefore, we
+ * choose another symbol to which to assign a codeword. We use
+ * 0 (if the used symbol is not 0) or 1 (if the used symbol is
+ * 0). In either case, the lesser-valued symbol must be
+ * assigned codeword 0 so that the resulting code is canonical.
+ */
+
+ unsigned sym = A[0] & SYMBOL_MASK;
+ unsigned nonzero_idx = sym ? sym : 1; /* the companion symbol */
+
+ codewords[0] = 0;
+ lens[0] = 1;
+ codewords[nonzero_idx] = 1;
+ lens[nonzero_idx] = 1;
+ return;
+ }
+
+ /*
+ * Build a stripped-down version of the Huffman tree, sharing the array
+ * 'A' with the symbol values. Then extract length counts from the tree
+ * and use them to generate the final codewords.
+ */
+
+ build_tree(A, num_used_syms);
+
+ {
+ unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1];
+
+ compute_length_counts(A, num_used_syms - 2,
+ len_counts, max_codeword_len);
+
+ gen_codewords(A, lens, len_counts, max_codeword_len, num_syms);
+ }
+}
+
+/*
+ * Clear the litlen and offset Huffman symbol frequency counters. This must
+ * be called when starting a new DEFLATE block.
+ */
+static void
+deflate_reset_symbol_frequencies(struct libdeflate_compressor *c)
+{
+ memset(&c->freqs, 0, sizeof(c->freqs));
+}
+
+/*
+ * Build the literal/length and offset Huffman codes for a DEFLATE block,
+ * length-limited to MAX_LITLEN_CODEWORD_LEN and MAX_OFFSET_CODEWORD_LEN bits.
+ * This takes as input the frequency tables for each alphabet and produces as
+ * output a set of tables that map symbols to codewords and codeword lengths.
+ */
+static void
+deflate_make_huffman_codes(const struct deflate_freqs *freqs,
+ struct deflate_codes *codes)
+{
+ deflate_make_huffman_code(DEFLATE_NUM_LITLEN_SYMS,
+ MAX_LITLEN_CODEWORD_LEN,
+ freqs->litlen,
+ codes->lens.litlen,
+ codes->codewords.litlen);
+
+ deflate_make_huffman_code(DEFLATE_NUM_OFFSET_SYMS,
+ MAX_OFFSET_CODEWORD_LEN,
+ freqs->offset,
+ codes->lens.offset,
+ codes->codewords.offset);
+}
+
+/* Initialize c->static_codes with the static DEFLATE Huffman codes (RFC 1951). */
+static void
+deflate_init_static_codes(struct libdeflate_compressor *c)
+{
+ unsigned i;
+
+ for (i = 0; i < 144; i++)
+ c->freqs.litlen[i] = 1 << (9 - 8); /* weight for 8-bit codewords */
+ for (; i < 256; i++)
+ c->freqs.litlen[i] = 1 << (9 - 9); /* weight for 9-bit codewords */
+ for (; i < 280; i++)
+ c->freqs.litlen[i] = 1 << (9 - 7); /* weight for 7-bit codewords */
+ for (; i < 288; i++)
+ c->freqs.litlen[i] = 1 << (9 - 8); /* weight for 8-bit codewords */
+
+ for (i = 0; i < 32; i++)
+ c->freqs.offset[i] = 1 << (5 - 5); /* weight for 5-bit codewords */
+
+ deflate_make_huffman_codes(&c->freqs, &c->static_codes);
+}
+
+/* Return the offset slot for the given match offset, using the small map. */
+static forceinline unsigned
+deflate_get_offset_slot(unsigned offset)
+{
+#if 1
+ if (offset <= 256)
+ return deflate_offset_slot[offset];
+ else
+ return deflate_offset_slot[256 + ((offset - 1) >> 7)];
+#else /* Branchless version */
+ u32 i1 = offset;
+ u32 i2 = 256 + ((offset - 1) >> 7);
+ u32 is_small = (s32)(offset - 257) >> 31; /* all-ones iff offset <= 256; assumes arithmetic >> */
+
+ return deflate_offset_slot[(i1 & is_small) ^ (i2 & ~is_small)];
+#endif
+}
+
+static unsigned
+deflate_compute_precode_items(const u8 lens[], const unsigned num_lens,
+ u32 precode_freqs[], unsigned precode_items[])
+{
+ unsigned *itemptr;
+ unsigned run_start;
+ unsigned run_end;
+ unsigned extra_bits;
+ u8 len;
+
+ memset(precode_freqs, 0,
+ DEFLATE_NUM_PRECODE_SYMS * sizeof(precode_freqs[0]));
+
+ itemptr = precode_items;
+ run_start = 0;
+ do {
+ /* Find the next run of codeword lengths. */
+
+ /* len = the length being repeated */
+ len = lens[run_start];
+
+ /* Extend the run. */
+ run_end = run_start;
+ do {
+ run_end++;
+ } while (run_end != num_lens && len == lens[run_end]);
+
+ if (len == 0) {
+ /* Run of zeroes. */
+
+ /* Symbol 18: RLE 11 to 138 zeroes at a time. */
+ while ((run_end - run_start) >= 11) {
+ extra_bits = MIN((run_end - run_start) - 11,
+ 0x7F);
+ precode_freqs[18]++;
+ *itemptr++ = 18 | (extra_bits << 5); /* item = sym | extra_bits << 5 */
+ run_start += 11 + extra_bits;
+ }
+
+ /* Symbol 17: RLE 3 to 10 zeroes at a time. */
+ if ((run_end - run_start) >= 3) {
+ extra_bits = MIN((run_end - run_start) - 3,
+ 0x7);
+ precode_freqs[17]++;
+ *itemptr++ = 17 | (extra_bits << 5);
+ run_start += 3 + extra_bits;
+ }
+ } else {
+
+ /* A run of nonzero lengths. */
+
+ /* Symbol 16: RLE 3 to 6 of the previous length. */
+ if ((run_end - run_start) >= 4) {
+ precode_freqs[len]++;
+ *itemptr++ = len;
+ run_start++;
+ do {
+ extra_bits = MIN((run_end - run_start) -
+ 3, 0x3);
+ precode_freqs[16]++;
+ *itemptr++ = 16 | (extra_bits << 5);
+ run_start += 3 + extra_bits;
+ } while ((run_end - run_start) >= 3);
+ }
+ }
+
+ /* Output any remaining lengths without RLE. */
+ while (run_start != run_end) {
+ precode_freqs[len]++;
+ *itemptr++ = len;
+ run_start++;
+ }
+ } while (run_start != num_lens);
+
+ return itemptr - precode_items; /* number of precode items written */
+}
+
+/*
+ * Huffman codeword lengths for dynamic Huffman blocks are compressed using a
+ * separate Huffman code, the "precode", which contains a symbol for each
+ * possible codeword length in the larger code as well as several special
+ * symbols to represent repeated codeword lengths (a form of run-length
+ * encoding). The precode is itself constructed in canonical form, and its
+ * codeword lengths are represented literally in 19 3-bit fields that
+ * immediately precede the compressed codeword lengths of the larger code.
+ */
+
+/*
+ * Precompute the information needed to output dynamic Huffman codes:
+ * the number of litlen and offset codeword lengths to transmit, the RLE
+ * "items" that encode them, and the precode itself.  Results are stored
+ * in c->o.precode.
+ */
+static void
+deflate_precompute_huffman_header(struct libdeflate_compressor *c)
+{
+ /* Compute how many litlen and offset symbols are needed. */
+
+ /*
+ * Trailing zero lengths are trimmed, but never below 257 litlen and
+ * 1 offset symbol — the minimums the block header fields can encode.
+ */
+ for (c->o.precode.num_litlen_syms = DEFLATE_NUM_LITLEN_SYMS;
+ c->o.precode.num_litlen_syms > 257;
+ c->o.precode.num_litlen_syms--)
+ if (c->codes.lens.litlen[c->o.precode.num_litlen_syms - 1] != 0)
+ break;
+
+ for (c->o.precode.num_offset_syms = DEFLATE_NUM_OFFSET_SYMS;
+ c->o.precode.num_offset_syms > 1;
+ c->o.precode.num_offset_syms--)
+ if (c->codes.lens.offset[c->o.precode.num_offset_syms - 1] != 0)
+ break;
+
+ /*
+ * If we're not using the full set of literal/length codeword lengths,
+ * then temporarily move the offset codeword lengths over so that the
+ * literal/length and offset codeword lengths are contiguous.
+ */
+ STATIC_ASSERT(offsetof(struct deflate_lens, offset) ==
+ DEFLATE_NUM_LITLEN_SYMS);
+ if (c->o.precode.num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) {
+ memmove((u8 *)&c->codes.lens + c->o.precode.num_litlen_syms,
+ (u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS,
+ c->o.precode.num_offset_syms);
+ }
+
+ /*
+ * Compute the "items" (RLE / literal tokens and extra bits) with which
+ * the codeword lengths in the larger code will be output.
+ */
+ c->o.precode.num_items =
+ deflate_compute_precode_items((u8 *)&c->codes.lens,
+ c->o.precode.num_litlen_syms +
+ c->o.precode.num_offset_syms,
+ c->o.precode.freqs,
+ c->o.precode.items);
+
+ /* Build the precode. */
+ deflate_make_huffman_code(DEFLATE_NUM_PRECODE_SYMS,
+ MAX_PRE_CODEWORD_LEN,
+ c->o.precode.freqs, c->o.precode.lens,
+ c->o.precode.codewords);
+
+ /* Count how many precode lengths we actually need to output. */
+ /* Lengths appear in permuted order; at least 4 are always output. */
+ for (c->o.precode.num_explicit_lens = DEFLATE_NUM_PRECODE_SYMS;
+ c->o.precode.num_explicit_lens > 4;
+ c->o.precode.num_explicit_lens--)
+ if (c->o.precode.lens[deflate_precode_lens_permutation[
+ c->o.precode.num_explicit_lens - 1]] != 0)
+ break;
+
+ /* Restore the offset codeword lengths if needed. */
+ if (c->o.precode.num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) {
+ memmove((u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS,
+ (u8 *)&c->codes.lens + c->o.precode.num_litlen_syms,
+ c->o.precode.num_offset_syms);
+ }
+}
+
+/*
+ * Precompute, for every possible match length, the "full" bit pattern that
+ * encodes it: the litlen codeword for the length's slot, with the slot's
+ * extra length bits appended above it.  This lets a match length be output
+ * with a single table lookup.  Results go in c->o.length.
+ */
+static void
+deflate_compute_full_len_codewords(struct libdeflate_compressor *c,
+                                   const struct deflate_codes *codes)
+{
+    unsigned len;
+
+    /* The combined codeword plus extra bits must fit in a 32-bit word. */
+    STATIC_ASSERT(MAX_LITLEN_CODEWORD_LEN +
+                  DEFLATE_MAX_EXTRA_LENGTH_BITS <= 32);
+
+    for (len = DEFLATE_MIN_MATCH_LEN; len <= DEFLATE_MAX_MATCH_LEN; len++) {
+        const unsigned length_slot = deflate_length_slot[len];
+        const unsigned sym = DEFLATE_FIRST_LEN_SYM + length_slot;
+        const unsigned codeword_len = codes->lens.litlen[sym];
+        const u32 extra = len - deflate_length_slot_base[length_slot];
+
+        /* Extra bits sit directly above the Huffman codeword. */
+        c->o.length.codewords[len] =
+            codes->codewords.litlen[sym] | (extra << codeword_len);
+        c->o.length.lens[len] =
+            codeword_len + deflate_extra_length_bits[length_slot];
+    }
+}
+
+/*
+ * Write a match to the output buffer.  Outputs, in order: the combined
+ * litlen codeword + extra length bits (precomputed in c->o.length), the
+ * offset slot's codeword, and the extra offset bits.  Intermediate
+ * FLUSH_BITS() calls are compiled in only when CAN_BUFFER() shows the
+ * bitbuffer is too small to hold all the fields at once.
+ */
+#define WRITE_MATCH(c_, codes_, length_, offset_, offset_slot_) \
+do { \
+ const struct libdeflate_compressor *c__ = (c_); \
+ const struct deflate_codes *codes__ = (codes_); \
+ unsigned length__ = (length_); \
+ unsigned offset__ = (offset_); \
+ unsigned offset_slot__ = (offset_slot_); \
+ \
+ /* Litlen symbol and extra length bits */ \
+ STATIC_ASSERT(CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + \
+ DEFLATE_MAX_EXTRA_LENGTH_BITS)); \
+ ADD_BITS(c__->o.length.codewords[length__], \
+ c__->o.length.lens[length__]); \
+ \
+ if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + \
+ DEFLATE_MAX_EXTRA_LENGTH_BITS + \
+ MAX_OFFSET_CODEWORD_LEN + \
+ DEFLATE_MAX_EXTRA_OFFSET_BITS)) \
+ FLUSH_BITS(); \
+ \
+ /* Offset symbol */ \
+ ADD_BITS(codes__->codewords.offset[offset_slot__], \
+ codes__->lens.offset[offset_slot__]); \
+ \
+ if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN + \
+ DEFLATE_MAX_EXTRA_OFFSET_BITS)) \
+ FLUSH_BITS(); \
+ \
+ /* Extra offset bits */ \
+ ADD_BITS(offset__ - deflate_offset_slot_base[offset_slot__], \
+ deflate_extra_offset_bits[offset_slot__]); \
+ \
+ FLUSH_BITS(); \
+} while (0)
+
+/*
+ * Choose the best type of block to use (dynamic Huffman, static Huffman, or
+ * uncompressed), then output it.  The exact bit cost of each block type is
+ * computed first and the cheapest wins.  Output state is carried in 'os';
+ * on output overflow, os->next is left equal to os->end.
+ */
+static void
+deflate_flush_block(struct libdeflate_compressor *c,
+ struct deflate_output_bitstream *os,
+ const u8 *block_begin, u32 block_length,
+ const struct deflate_sequence *sequences,
+ bool is_final_block)
+{
+ /*
+ * It is hard to get compilers to understand that writes to 'os->next'
+ * don't alias 'os'. That hurts performance significantly, as
+ * everything in 'os' would keep getting re-loaded. ('restrict'
+ * *should* do the trick, but it's unreliable.) Therefore, we keep all
+ * the output bitstream state in local variables, and output bits using
+ * macros. This is similar to what the decompressor does.
+ */
+ const u8 *in_next = block_begin;
+ const u8 * const in_end = block_begin + block_length;
+ bitbuf_t bitbuf = os->bitbuf;
+ unsigned bitcount = os->bitcount;
+ u8 *out_next = os->next;
+ u8 * const out_end = os->end;
+ /* The cost for each block type, in bits */
+ u32 dynamic_cost = 0;
+ u32 static_cost = 0;
+ u32 uncompressed_cost = 0;
+ u32 best_cost;
+ struct deflate_codes *codes;
+ unsigned sym;
+
+ ASSERT(block_length >= MIN_BLOCK_LENGTH || is_final_block);
+ ASSERT(block_length <= MAX_BLOCK_LENGTH);
+ ASSERT(bitcount <= 7);
+ ASSERT((bitbuf & ~(((bitbuf_t)1 << bitcount) - 1)) == 0);
+ ASSERT(out_next <= out_end);
+
+ if (sequences != NULL /* !near_optimal */ ||
+ !SUPPORT_NEAR_OPTIMAL_PARSING) {
+ /* Tally the end-of-block symbol. */
+ c->freqs.litlen[DEFLATE_END_OF_BLOCK]++;
+
+ /* Build dynamic Huffman codes. */
+ deflate_make_huffman_codes(&c->freqs, &c->codes);
+ } /* Else, this was already done. */
+
+ /* Precompute the precode items and build the precode. */
+ deflate_precompute_huffman_header(c);
+
+ /* Account for the cost of encoding dynamic Huffman codes. */
+ /* 5+5+4: the HLIT, HDIST, and HCLEN fields; 3 bits per precode len. */
+ dynamic_cost += 5 + 5 + 4 + (3 * c->o.precode.num_explicit_lens);
+ for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) {
+ u32 extra = deflate_extra_precode_bits[sym];
+
+ dynamic_cost += c->o.precode.freqs[sym] *
+ (extra + c->o.precode.lens[sym]);
+ }
+
+ /* Account for the cost of encoding literals. */
+ /* Static code: 8 bits for literals 0-143, 9 bits for 144-255. */
+ for (sym = 0; sym < 144; sym++) {
+ dynamic_cost += c->freqs.litlen[sym] *
+ c->codes.lens.litlen[sym];
+ static_cost += c->freqs.litlen[sym] * 8;
+ }
+ for (; sym < 256; sym++) {
+ dynamic_cost += c->freqs.litlen[sym] *
+ c->codes.lens.litlen[sym];
+ static_cost += c->freqs.litlen[sym] * 9;
+ }
+
+ /* Account for the cost of encoding the end-of-block symbol. */
+ dynamic_cost += c->codes.lens.litlen[DEFLATE_END_OF_BLOCK];
+ static_cost += 7;
+
+ /* Account for the cost of encoding lengths. */
+ for (sym = DEFLATE_FIRST_LEN_SYM;
+ sym < DEFLATE_FIRST_LEN_SYM + ARRAY_LEN(deflate_extra_length_bits);
+ sym++) {
+ u32 extra = deflate_extra_length_bits[
+ sym - DEFLATE_FIRST_LEN_SYM];
+
+ dynamic_cost += c->freqs.litlen[sym] *
+ (extra + c->codes.lens.litlen[sym]);
+ static_cost += c->freqs.litlen[sym] *
+ (extra + c->static_codes.lens.litlen[sym]);
+ }
+
+ /* Account for the cost of encoding offsets. */
+ /* Static offset codewords are all 5 bits. */
+ for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) {
+ u32 extra = deflate_extra_offset_bits[sym];
+
+ dynamic_cost += c->freqs.offset[sym] *
+ (extra + c->codes.lens.offset[sym]);
+ static_cost += c->freqs.offset[sym] * (extra + 5);
+ }
+
+ /* Compute the cost of using uncompressed blocks. */
+ /*
+ * (-(bitcount + 3) & 7): padding to a byte boundary after BFINAL and
+ * BTYPE; 32: the LEN and NLEN fields; 40 per extra block: another
+ * byte-aligned 3-bit header plus LEN/NLEN; 8 bits per data byte.
+ */
+ uncompressed_cost += (-(bitcount + 3) & 7) + 32 +
+ (40 * (DIV_ROUND_UP(block_length,
+ UINT16_MAX) - 1)) +
+ (8 * block_length);
+
+ /* Choose and output the cheapest type of block. */
+ best_cost = MIN(static_cost, uncompressed_cost);
+ if (dynamic_cost < best_cost) {
+ const unsigned num_explicit_lens = c->o.precode.num_explicit_lens;
+ const unsigned num_precode_items = c->o.precode.num_items;
+ unsigned precode_sym, precode_item;
+ unsigned i;
+
+ /* Dynamic Huffman block */
+
+ best_cost = dynamic_cost;
+ codes = &c->codes;
+ STATIC_ASSERT(CAN_BUFFER(1 + 2 + 5 + 5 + 4 + 3));
+ ADD_BITS(is_final_block, 1);
+ ADD_BITS(DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN, 2);
+ ADD_BITS(c->o.precode.num_litlen_syms - 257, 5);
+ ADD_BITS(c->o.precode.num_offset_syms - 1, 5);
+ ADD_BITS(num_explicit_lens - 4, 4);
+
+ /* Output the lengths of the codewords in the precode. */
+ if (CAN_BUFFER(3 * (DEFLATE_NUM_PRECODE_SYMS - 1))) {
+ /*
+ * A 64-bit bitbuffer is just one bit too small to hold
+ * the maximum number of precode lens, so to minimize
+ * flushes we merge one len with the previous fields.
+ */
+ precode_sym = deflate_precode_lens_permutation[0];
+ ADD_BITS(c->o.precode.lens[precode_sym], 3);
+ FLUSH_BITS();
+ i = 1; /* num_explicit_lens >= 4 */
+ do {
+ precode_sym =
+ deflate_precode_lens_permutation[i];
+ ADD_BITS(c->o.precode.lens[precode_sym], 3);
+ } while (++i < num_explicit_lens);
+ FLUSH_BITS();
+ } else {
+ FLUSH_BITS();
+ i = 0;
+ do {
+ precode_sym =
+ deflate_precode_lens_permutation[i];
+ ADD_BITS(c->o.precode.lens[precode_sym], 3);
+ FLUSH_BITS();
+ } while (++i < num_explicit_lens);
+ }
+
+ /*
+ * Output the lengths of the codewords in the litlen and offset
+ * codes, encoded by the precode.
+ */
+ i = 0;
+ do {
+ precode_item = c->o.precode.items[i];
+ precode_sym = precode_item & 0x1F;
+ STATIC_ASSERT(CAN_BUFFER(MAX_PRE_CODEWORD_LEN + 7));
+ ADD_BITS(c->o.precode.codewords[precode_sym],
+ c->o.precode.lens[precode_sym]);
+ ADD_BITS(precode_item >> 5,
+ deflate_extra_precode_bits[precode_sym]);
+ FLUSH_BITS();
+ } while (++i < num_precode_items);
+ } else if (static_cost < uncompressed_cost) {
+ /* Static Huffman block */
+ codes = &c->static_codes;
+ ADD_BITS(is_final_block, 1);
+ ADD_BITS(DEFLATE_BLOCKTYPE_STATIC_HUFFMAN, 2);
+ FLUSH_BITS();
+ } else {
+ /*
+ * Uncompressed block(s). DEFLATE limits the length of
+ * uncompressed blocks to UINT16_MAX bytes, so if the length of
+ * the "block" we're flushing is over UINT16_MAX, we actually
+ * output multiple blocks.
+ */
+ do {
+ u8 bfinal = 0;
+ size_t len = UINT16_MAX;
+
+ if (in_end - in_next <= UINT16_MAX) {
+ bfinal = is_final_block;
+ len = in_end - in_next;
+ }
+ if (out_end - out_next <
+ (bitcount + 3 + 7) / 8 + 4 + len) {
+ /* Not enough output space remaining. */
+ out_next = out_end;
+ goto out;
+ }
+ /*
+ * Output BFINAL (1 bit) and BTYPE (2 bits), then align
+ * to a byte boundary.
+ */
+ STATIC_ASSERT(DEFLATE_BLOCKTYPE_UNCOMPRESSED == 0);
+ *out_next++ = (bfinal << bitcount) | bitbuf;
+ if (bitcount > 5)
+ *out_next++ = 0;
+ bitbuf = 0;
+ bitcount = 0;
+ /* Output LEN and NLEN, then the data itself. */
+ put_unaligned_le16(len, out_next);
+ out_next += 2;
+ put_unaligned_le16(~len, out_next);
+ out_next += 2;
+ memcpy(out_next, in_next, len);
+ out_next += len;
+ in_next += len;
+ } while (in_next != in_end);
+ /* Done outputting uncompressed block(s) */
+ goto out;
+ }
+
+ /* Output the literals and matches for a dynamic or static block. */
+ ASSERT(bitcount <= 7);
+ deflate_compute_full_len_codewords(c, codes);
+#if SUPPORT_NEAR_OPTIMAL_PARSING
+ if (sequences == NULL) {
+ /* Output the literals and matches from the minimum-cost path */
+ struct deflate_optimum_node *cur_node =
+ &c->p.n.optimum_nodes[0];
+ struct deflate_optimum_node * const end_node =
+ &c->p.n.optimum_nodes[block_length];
+ do {
+ unsigned length = cur_node->item & OPTIMUM_LEN_MASK;
+ unsigned offset = cur_node->item >>
+ OPTIMUM_OFFSET_SHIFT;
+ if (length == 1) {
+ /* Literal */
+ ADD_BITS(codes->codewords.litlen[offset],
+ codes->lens.litlen[offset]);
+ FLUSH_BITS();
+ } else {
+ /* Match */
+ WRITE_MATCH(c, codes, length, offset,
+ c->p.n.offset_slot_full[offset]);
+ }
+ cur_node += length;
+ } while (cur_node != end_node);
+ } else
+#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
+ {
+ /* Output the literals and matches from the sequences list. */
+ const struct deflate_sequence *seq;
+
+ for (seq = sequences; ; seq++) {
+ u32 litrunlen = seq->litrunlen_and_length &
+ SEQ_LITRUNLEN_MASK;
+ unsigned length = seq->litrunlen_and_length >>
+ SEQ_LENGTH_SHIFT;
+ unsigned lit;
+
+ /* Output a run of literals. */
+ /* Fast path: emit 4 literals per flush when possible. */
+ if (CAN_BUFFER(4 * MAX_LITLEN_CODEWORD_LEN)) {
+ for (; litrunlen >= 4; litrunlen -= 4) {
+ lit = *in_next++;
+ ADD_BITS(codes->codewords.litlen[lit],
+ codes->lens.litlen[lit]);
+ lit = *in_next++;
+ ADD_BITS(codes->codewords.litlen[lit],
+ codes->lens.litlen[lit]);
+ lit = *in_next++;
+ ADD_BITS(codes->codewords.litlen[lit],
+ codes->lens.litlen[lit]);
+ lit = *in_next++;
+ ADD_BITS(codes->codewords.litlen[lit],
+ codes->lens.litlen[lit]);
+ FLUSH_BITS();
+ }
+ if (litrunlen-- != 0) {
+ lit = *in_next++;
+ ADD_BITS(codes->codewords.litlen[lit],
+ codes->lens.litlen[lit]);
+ if (litrunlen-- != 0) {
+ lit = *in_next++;
+ ADD_BITS(codes->codewords.litlen[lit],
+ codes->lens.litlen[lit]);
+ if (litrunlen-- != 0) {
+ lit = *in_next++;
+ ADD_BITS(codes->codewords.litlen[lit],
+ codes->lens.litlen[lit]);
+ }
+ }
+ FLUSH_BITS();
+ }
+ } else {
+ while (litrunlen--) {
+ lit = *in_next++;
+ ADD_BITS(codes->codewords.litlen[lit],
+ codes->lens.litlen[lit]);
+ FLUSH_BITS();
+ }
+ }
+
+ if (length == 0) { /* Last sequence? */
+ ASSERT(in_next == in_end);
+ break;
+ }
+
+ /* Output a match. */
+ WRITE_MATCH(c, codes, length, seq->offset,
+ seq->offset_slot);
+ in_next += length;
+ }
+ }
+
+ /* Output the end-of-block symbol. */
+ ASSERT(bitcount <= 7);
+ ADD_BITS(codes->codewords.litlen[DEFLATE_END_OF_BLOCK],
+ codes->lens.litlen[DEFLATE_END_OF_BLOCK]);
+ FLUSH_BITS();
+out:
+ ASSERT(bitcount <= 7);
+ /*
+ * Assert that the block cost was computed correctly, as
+ * libdeflate_deflate_compress_bound() relies on this via the assumption
+ * that uncompressed blocks will always be used when cheaper.
+ */
+ ASSERT(8 * (out_next - os->next) + bitcount - os->bitcount ==
+ 3 + best_cost || out_next == out_end);
+
+ os->bitbuf = bitbuf;
+ os->bitcount = bitcount;
+ os->next = out_next;
+}
+
+/******************************************************************************/
+
+/*
+ * Block splitting algorithm. The problem is to decide when it is worthwhile to
+ * start a new block with new Huffman codes. There is a theoretically optimal
+ * solution: recursively consider every possible block split, considering the
+ * exact cost of each block, and choose the minimum cost approach. But this is
+ * far too slow. Instead, as an approximation, we can count symbols and after
+ * every N symbols, compare the expected distribution of symbols based on the
+ * previous data with the actual distribution. If they differ "by enough", then
+ * start a new block.
+ *
+ * As an optimization and heuristic, we don't distinguish between every symbol
+ * but rather we combine many symbols into a single "observation type". For
+ * literals we only look at the high bits and low bits, and for matches we only
+ * look at whether the match is long or not. The assumption is that for typical
+ * "real" data, places that are good block boundaries will tend to be noticeable
+ * based only on changes in these aggregate probabilities, without looking for
+ * subtle differences in individual symbols. For example, a change from ASCII
+ * bytes to non-ASCII bytes, or from few matches (generally less compressible)
+ * to many matches (generally more compressible), would be easily noticed based
+ * on the aggregates.
+ *
+ * For determining whether the probability distributions are "different enough"
+ * to start a new block, the simple heuristic of splitting when the sum of
+ * absolute differences exceeds a constant seems to be good enough. We also add
+ * a number proportional to the block length so that the algorithm is more
+ * likely to end long blocks than short blocks. This reflects the general
+ * expectation that it will become increasingly beneficial to start a new block
+ * as the current block grows longer.
+ *
+ * Finally, for an approximation, it is not strictly necessary that the exact
+ * symbols being used are considered. With "near-optimal parsing", for example,
+ * the actual symbols that will be used are unknown until after the block
+ * boundary is chosen and the block has been optimized. Since the final choices
+ * cannot be used, we can use preliminary "greedy" choices instead.
+ */
+
+/* Reset all block split statistics in preparation for a new block. */
+static void
+init_block_split_stats(struct block_split_stats *stats)
+{
+    int t;
+
+    for (t = 0; t < NUM_OBSERVATION_TYPES; t++) {
+        stats->observations[t] = 0;
+        stats->new_observations[t] = 0;
+    }
+    stats->num_observations = 0;
+    stats->num_new_observations = 0;
+}
+
+/*
+ * Tally a literal for the block split statistics.  Heuristic: bucket by
+ * the literal's top 2 bits and low 1 bit, giving 8 observation types.
+ */
+static forceinline void
+observe_literal(struct block_split_stats *stats, u8 lit)
+{
+    const unsigned type = ((lit >> 5) & 0x6) | (lit & 1);
+
+    stats->new_observations[type]++;
+    stats->num_new_observations++;
+}
+
+/*
+ * Tally a match for the block split statistics.  Heuristic: distinguish
+ * only "short" (length < 9) from "long" (length >= 9) matches.
+ */
+static forceinline void
+observe_match(struct block_split_stats *stats, unsigned length)
+{
+    const unsigned type = NUM_LITERAL_OBSERVATION_TYPES + (length >= 9);
+
+    stats->new_observations[type]++;
+    stats->num_new_observations++;
+}
+
+/* Fold the pending "new" observation counts into the running totals. */
+static void
+merge_new_observations(struct block_split_stats *stats)
+{
+    int t;
+
+    for (t = 0; t < NUM_OBSERVATION_TYPES; t++) {
+        stats->observations[t] += stats->new_observations[t];
+        stats->new_observations[t] = 0;
+    }
+    stats->num_observations += stats->num_new_observations;
+    stats->num_new_observations = 0;
+}
+
+/*
+ * Decide whether the current block should end at 'block_length' bytes,
+ * based on how much the distribution of the new observations differs from
+ * that of the previous observations.  Returns true to end the block;
+ * otherwise merges the new observations into the totals and returns false.
+ */
+static bool
+do_end_block_check(struct block_split_stats *stats, u32 block_length)
+{
+ if (stats->num_observations > 0) {
+ /*
+ * Compute the sum of absolute differences of probabilities. To
+ * avoid needing to use floating point arithmetic or do slow
+ * divisions, we do all arithmetic with the probabilities
+ * multiplied by num_observations * num_new_observations. E.g.,
+ * for the "old" observations the probabilities would be
+ * (double)observations[i] / num_observations, but since we
+ * multiply by both num_observations and num_new_observations we
+ * really do observations[i] * num_new_observations.
+ */
+ u32 total_delta = 0;
+ u32 num_items;
+ u32 cutoff;
+ int i;
+
+ for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
+ u32 expected = stats->observations[i] *
+ stats->num_new_observations;
+ u32 actual = stats->new_observations[i] *
+ stats->num_observations;
+ u32 delta = (actual > expected) ? actual - expected :
+ expected - actual;
+
+ total_delta += delta;
+ }
+
+ num_items = stats->num_observations +
+ stats->num_new_observations;
+ /*
+ * Heuristic: the cutoff is when the sum of absolute differences
+ * of probabilities becomes at least 200/512. As above, the
+ * probability is multiplied by both num_new_observations and
+ * num_observations. Be careful to avoid integer overflow.
+ */
+ cutoff = stats->num_new_observations * 200 / 512 *
+ stats->num_observations;
+ /*
+ * Very short blocks have a lot of overhead for the Huffman
+ * codes, so only use them if it clearly seems worthwhile.
+ * (This is an additional penalty, which adds to the smaller
+ * penalty below which scales more slowly.)
+ */
+ if (block_length < 10000 && num_items < 8192)
+ cutoff += (u64)cutoff * (8192 - num_items) / 8192;
+
+ /* Ready to end the block? */
+ /* The second term penalizes long blocks, growing with length. */
+ if (total_delta +
+ (block_length / 4096) * stats->num_observations >= cutoff)
+ return true;
+ }
+ merge_new_observations(stats);
+ return false;
+}
+
+/*
+ * Return true if enough new observations have accumulated, and both the
+ * current block and the remaining input are long enough, to make an
+ * end-of-block check worthwhile.
+ */
+static forceinline bool
+ready_to_check_block(const struct block_split_stats *stats,
+                     const u8 *in_block_begin, const u8 *in_next,
+                     const u8 *in_end)
+{
+    if (stats->num_new_observations < NUM_OBSERVATIONS_PER_BLOCK_CHECK)
+        return false;
+    if (in_next - in_block_begin < MIN_BLOCK_LENGTH)
+        return false;
+    return in_end - in_next >= MIN_BLOCK_LENGTH;
+}
+
+/*
+ * Decide whether to end the current block here.  The cheap readiness
+ * checks run first; the statistics comparison only runs (and only mutates
+ * 'stats') when the block is actually eligible to end.
+ */
+static forceinline bool
+should_end_block(struct block_split_stats *stats,
+                 const u8 *in_block_begin, const u8 *in_next, const u8 *in_end)
+{
+    return ready_to_check_block(stats, in_block_begin, in_next, in_end) &&
+           do_end_block_check(stats, in_next - in_block_begin);
+}
+
+/******************************************************************************/
+
+/*
+ * Begin a new sequence list: mark the first sequence as an empty literal
+ * run and clear the symbol frequency counters.
+ */
+static void
+deflate_begin_sequences(struct libdeflate_compressor *c,
+                        struct deflate_sequence *first_seq)
+{
+    first_seq->litrunlen_and_length = 0;
+    deflate_reset_symbol_frequencies(c);
+}
+
+/*
+ * Record the choice of a literal: bump its frequency, optionally feed the
+ * block splitter, and extend the current sequence's literal run.
+ */
+static forceinline void
+deflate_choose_literal(struct libdeflate_compressor *c, unsigned literal,
+                       bool gather_split_stats, struct deflate_sequence *seq)
+{
+    /* The literal run length field cannot overflow within one block. */
+    STATIC_ASSERT(MAX_BLOCK_LENGTH <= SEQ_LITRUNLEN_MASK);
+
+    c->freqs.litlen[literal]++;
+    if (gather_split_stats)
+        observe_literal(&c->split_stats, literal);
+    seq->litrunlen_and_length++;
+}
+
+/*
+ * Record the choice of a match: update the symbol frequencies and split
+ * statistics, store the match in the current sequence, and start a fresh
+ * sequence with an empty literal run.
+ */
+static forceinline void
+deflate_choose_match(struct libdeflate_compressor *c,
+                     unsigned length, unsigned offset, bool gather_split_stats,
+                     struct deflate_sequence **seq_p)
+{
+    const unsigned length_slot = deflate_length_slot[length];
+    const unsigned offset_slot = deflate_get_offset_slot(offset);
+    struct deflate_sequence *seq = *seq_p;
+
+    if (gather_split_stats)
+        observe_match(&c->split_stats, length);
+
+    c->freqs.litlen[DEFLATE_FIRST_LEN_SYM + length_slot]++;
+    c->freqs.offset[offset_slot]++;
+
+    /* Complete the current sequence with the match data... */
+    seq->litrunlen_and_length |= (u32)length << SEQ_LENGTH_SHIFT;
+    seq->offset = offset;
+    seq->offset_slot = offset_slot;
+
+    /* ...and begin the next one. */
+    seq++;
+    seq->litrunlen_and_length = 0;
+    *seq_p = seq;
+}
+
+/*
+ * Clamp the maximum and nice match lengths to the number of bytes
+ * remaining in the input buffer, so a match can never extend past the end.
+ */
+static forceinline void
+adjust_max_and_nice_len(unsigned *max_len, unsigned *nice_len, size_t remaining)
+{
+    if (unlikely(remaining < DEFLATE_MAX_MATCH_LEN)) {
+        *max_len = remaining;
+        if (*nice_len > *max_len)
+            *nice_len = *max_len;
+    }
+}
+
+/*
+ * Choose the minimum match length for the greedy and lazy parsers.
+ *
+ * By default the minimum match length is 3, which is the smallest length the
+ * DEFLATE format allows. However, with greedy and lazy parsing, some data
+ * (e.g. DNA sequencing data) benefits greatly from a longer minimum length.
+ * Typically, this is because literals are very cheap. In general, the
+ * near-optimal parser handles this case naturally, but the greedy and lazy
+ * parsers need a heuristic to decide when to use short matches.
+ *
+ * The heuristic we use is to make the minimum match length depend on the number
+ * of different literals that exist in the data. If there are many different
+ * literals, then literals will probably be expensive, so short matches will
+ * probably be worthwhile. Conversely, if not many literals are used, then
+ * probably literals will be cheap and short matches won't be worthwhile.
+ */
+static unsigned
+choose_min_match_len(unsigned num_used_literals, unsigned max_search_depth)
+{
+    /*
+     * Map from the number of distinct literals used to the minimum match
+     * length.  Entries beyond the end of the table are implicitly 3.
+     */
+    static const u8 min_lens[] = {
+        9, 9, 9, 9, 9, 9, 8, 8, 7, 7, 6, 6, 6, 6, 6, 6,
+        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4,
+        4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+        4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+    };
+    unsigned min_len;
+
+    STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN <= 3);
+    STATIC_ASSERT(ARRAY_LEN(min_lens) <= DEFLATE_NUM_LITERALS + 1);
+
+    if (num_used_literals >= ARRAY_LEN(min_lens))
+        return 3;
+    min_len = min_lens[num_used_literals];
+
+    /*
+     * A shallow search makes long matches hard to find, so cap the
+     * minimum length accordingly.
+     */
+    if (max_search_depth < 5)
+        min_len = MIN(min_len, 4);
+    else if (max_search_depth < 10)
+        min_len = MIN(min_len, 5);
+    else if (max_search_depth < 16)
+        min_len = MIN(min_len, 7);
+
+    return min_len;
+}
+
+/*
+ * Estimate a good minimum match length by counting how many distinct byte
+ * values occur near the start of the data.
+ */
+static unsigned
+calculate_min_match_len(const u8 *data, size_t data_len,
+                        unsigned max_search_depth)
+{
+    u8 seen[256] = { 0 };
+    unsigned num_used_literals = 0;
+    size_t i;
+
+    /*
+     * Scan at most the first 4 KiB as an initial approximation; the
+     * caller may use recalculate_min_match_len() to refine it later.
+     */
+    if (data_len > 4096)
+        data_len = 4096;
+    for (i = 0; i < data_len; i++)
+        seen[data[i]] = 1;
+    for (i = 0; i < 256; i++)
+        num_used_literals += seen[i];
+    return choose_min_match_len(num_used_literals, max_search_depth);
+}
+
+/*
+ * Re-derive the minimum match length from the literal frequencies actually
+ * accumulated for the block (freqs->litlen), ignoring literals whose
+ * frequency falls at or below ~1/1024 of the total.
+ */
+static unsigned
+recalculate_min_match_len(const struct deflate_freqs *freqs,
+                          unsigned max_search_depth)
+{
+    u32 total_lit_freq = 0;
+    u32 rare_cutoff;
+    unsigned num_used_literals = 0;
+    int sym;
+
+    for (sym = 0; sym < DEFLATE_NUM_LITERALS; sym++)
+        total_lit_freq += freqs->litlen[sym];
+
+    /* Literals used very rarely don't count as "used". */
+    rare_cutoff = total_lit_freq >> 10;
+
+    for (sym = 0; sym < DEFLATE_NUM_LITERALS; sym++)
+        num_used_literals += (freqs->litlen[sym] > rare_cutoff);
+
+    return choose_min_match_len(num_used_literals, max_search_depth);
+}
+
+/*
+ * Return the position at which the current block must end, at most
+ * 'soft_max_len' bytes from its start.  If ending there would leave fewer
+ * than MIN_BLOCK_LENGTH bytes for the next block, extend to the end of the
+ * input instead.
+ */
+static forceinline const u8 *
+choose_max_block_end(const u8 *in_block_begin, const u8 *in_end,
+                     size_t soft_max_len)
+{
+    const size_t remaining = in_end - in_block_begin;
+
+    if (remaining < soft_max_len + MIN_BLOCK_LENGTH)
+        return in_end;
+    return in_block_begin + soft_max_len;
+}
+
+/*
+ * The level 0 "compressor": store the input as uncompressed DEFLATE
+ * blocks.  Returns the number of output bytes written, or 0 if the output
+ * buffer is too small.
+ */
+static size_t
+deflate_compress_none(const u8 *in, size_t in_nbytes,
+                      u8 *out, size_t out_nbytes_avail)
+{
+    const u8 * const in_end = in + in_nbytes;
+    const u8 *in_next = in;
+    u8 * const out_end = out + out_nbytes_avail;
+    u8 *out_next = out;
+
+    /*
+     * A zero-length input still requires one (empty) block for the output
+     * to be a valid DEFLATE stream.  Handle it specially so the memcpy()
+     * below is never given a NULL source pointer.
+     */
+    if (unlikely(in_nbytes == 0)) {
+        if (out_nbytes_avail < 5)
+            return 0;
+        /* BFINAL=1 and BTYPE=UNCOMPRESSED */
+        *out_next++ = 1 | (DEFLATE_BLOCKTYPE_UNCOMPRESSED << 1);
+        /* LEN=0 and NLEN=0xFFFF */
+        put_unaligned_le32(0xFFFF0000, out_next);
+        return 5;
+    }
+
+    do {
+        const size_t remaining = in_end - in_next;
+        u8 bfinal = 0;
+        size_t len = UINT16_MAX;
+
+        if (remaining <= UINT16_MAX) {
+            bfinal = 1;
+            len = remaining;
+        }
+        /* Each block needs 1 header byte, LEN/NLEN, and the data. */
+        if ((size_t)(out_end - out_next) < 5 + len)
+            return 0;
+        /*
+         * Output BFINAL and BTYPE.  The stream is already byte-aligned
+         * here, so this step always requires outputting exactly 1 byte.
+         */
+        *out_next++ = bfinal | (DEFLATE_BLOCKTYPE_UNCOMPRESSED << 1);
+
+        /* Output LEN and NLEN, then the data itself. */
+        put_unaligned_le16(len, out_next);
+        out_next += 2;
+        put_unaligned_le16(~len, out_next);
+        out_next += 2;
+        memcpy(out_next, in_next, len);
+        out_next += len;
+        in_next += len;
+    } while (in_next != in_end);
+
+    return out_next - out;
+}
+
+/*
+ * This is a faster variant of deflate_compress_greedy(). It uses the
+ * ht_matchfinder rather than the hc_matchfinder. It also skips the block
+ * splitting algorithm and just uses fixed length blocks. c->max_search_depth
+ * has no effect with this algorithm, as it is hardcoded in ht_matchfinder.h.
+ */
+static void
+deflate_compress_fastest(struct libdeflate_compressor * restrict c,
+ const u8 *in, size_t in_nbytes,
+ struct deflate_output_bitstream *os)
+{
+ const u8 *in_next = in;
+ const u8 *in_end = in_next + in_nbytes;
+ const u8 *in_cur_base = in_next;
+ unsigned max_len = DEFLATE_MAX_MATCH_LEN;
+ unsigned nice_len = MIN(c->nice_match_length, max_len);
+ u32 next_hash = 0;
+
+ ht_matchfinder_init(&c->p.f.ht_mf);
+
+ do {
+ /* Starting a new DEFLATE block */
+
+ const u8 * const in_block_begin = in_next;
+ const u8 * const in_max_block_end = choose_max_block_end(
+ in_next, in_end, FAST_SOFT_MAX_BLOCK_LENGTH);
+ struct deflate_sequence *seq = c->p.f.sequences;
+
+ deflate_begin_sequences(c, seq);
+
+ do {
+ u32 length;
+ u32 offset;
+ size_t remaining = in_end - in_next;
+
+ /* Approaching the end of input: clamp the lengths. */
+ if (unlikely(remaining < DEFLATE_MAX_MATCH_LEN)) {
+ max_len = remaining;
+ /*
+ * Too few bytes left for the matchfinder to
+ * operate on; emit the rest as literals.
+ */
+ if (max_len < HT_MATCHFINDER_REQUIRED_NBYTES) {
+ do {
+ deflate_choose_literal(c,
+ *in_next++, false, seq);
+ } while (--max_len);
+ break;
+ }
+ nice_len = MIN(nice_len, max_len);
+ }
+ length = ht_matchfinder_longest_match(&c->p.f.ht_mf,
+ &in_cur_base,
+ in_next,
+ max_len,
+ nice_len,
+ &next_hash,
+ &offset);
+ if (length) {
+ /* Match found */
+ deflate_choose_match(c, length, offset, false,
+ &seq);
+ ht_matchfinder_skip_bytes(&c->p.f.ht_mf,
+ &in_cur_base,
+ in_next + 1,
+ in_end,
+ length - 1,
+ &next_hash);
+ in_next += length;
+ } else {
+ /* No match found */
+ deflate_choose_literal(c, *in_next++, false,
+ seq);
+ }
+
+ /* Check if it's time to output another block. */
+ } while (in_next < in_max_block_end &&
+ seq < &c->p.f.sequences[FAST_SEQ_STORE_LENGTH]);
+
+ deflate_flush_block(c, os, in_block_begin,
+ in_next - in_block_begin,
+ c->p.f.sequences, in_next == in_end);
+ } while (in_next != in_end);
+}
+
+/*
+ * This is the "greedy" DEFLATE compressor. It always chooses the longest
+ * match, using the hc_matchfinder, and splits blocks with the block split
+ * statistics heuristic.
+ */
+static void
+deflate_compress_greedy(struct libdeflate_compressor * restrict c,
+ const u8 *in, size_t in_nbytes,
+ struct deflate_output_bitstream *os)
+{
+ const u8 *in_next = in;
+ const u8 *in_end = in_next + in_nbytes;
+ const u8 *in_cur_base = in_next;
+ unsigned max_len = DEFLATE_MAX_MATCH_LEN;
+ unsigned nice_len = MIN(c->nice_match_length, max_len);
+ u32 next_hashes[2] = {0, 0};
+
+ hc_matchfinder_init(&c->p.g.hc_mf);
+
+ do {
+ /* Starting a new DEFLATE block */
+
+ const u8 * const in_block_begin = in_next;
+ const u8 * const in_max_block_end = choose_max_block_end(
+ in_next, in_end, SOFT_MAX_BLOCK_LENGTH);
+ struct deflate_sequence *seq = c->p.g.sequences;
+ unsigned min_len;
+
+ init_block_split_stats(&c->split_stats);
+ deflate_begin_sequences(c, seq);
+ min_len = calculate_min_match_len(in_next,
+ in_max_block_end - in_next,
+ c->max_search_depth);
+ do {
+ u32 length;
+ u32 offset;
+
+ adjust_max_and_nice_len(&max_len, &nice_len,
+ in_end - in_next);
+ length = hc_matchfinder_longest_match(
+ &c->p.g.hc_mf,
+ &in_cur_base,
+ in_next,
+ min_len - 1,
+ max_len,
+ nice_len,
+ c->max_search_depth,
+ next_hashes,
+ &offset);
+
+ /*
+ * Accept the match unless it has the absolute minimum
+ * length with a large offset (> 4096), which tends to
+ * be a poor trade against literals.
+ */
+ if (length >= min_len &&
+ (length > DEFLATE_MIN_MATCH_LEN ||
+ offset <= 4096)) {
+ /* Match found */
+ deflate_choose_match(c, length, offset, true,
+ &seq);
+ hc_matchfinder_skip_bytes(&c->p.g.hc_mf,
+ &in_cur_base,
+ in_next + 1,
+ in_end,
+ length - 1,
+ next_hashes);
+ in_next += length;
+ } else {
+ /* No match found */
+ deflate_choose_literal(c, *in_next++, true,
+ seq);
+ }
+
+ /* Check if it's time to output another block. */
+ } while (in_next < in_max_block_end &&
+ seq < &c->p.g.sequences[SEQ_STORE_LENGTH] &&
+ !should_end_block(&c->split_stats,
+ in_block_begin, in_next, in_end));
+
+ deflate_flush_block(c, os, in_block_begin,
+ in_next - in_block_begin,
+ c->p.g.sequences, in_next == in_end);
+ } while (in_next != in_end);
+}
+
+static forceinline void
+deflate_compress_lazy_generic(struct libdeflate_compressor * restrict c,
+ const u8 *in, size_t in_nbytes,
+ struct deflate_output_bitstream *os, bool lazy2)
+{
+ const u8 *in_next = in;
+ const u8 *in_end = in_next + in_nbytes;
+ const u8 *in_cur_base = in_next;
+ unsigned max_len = DEFLATE_MAX_MATCH_LEN;
+ unsigned nice_len = MIN(c->nice_match_length, max_len);
+ u32 next_hashes[2] = {0, 0};
+
+ hc_matchfinder_init(&c->p.g.hc_mf);
+
+ do {
+ /* Starting a new DEFLATE block */
+
+ const u8 * const in_block_begin = in_next;
+ const u8 * const in_max_block_end = choose_max_block_end(
+ in_next, in_end, SOFT_MAX_BLOCK_LENGTH);
+ const u8 *next_recalc_min_len =
+ in_next + MIN(in_end - in_next, 10000);
+ struct deflate_sequence *seq = c->p.g.sequences;
+ unsigned min_len;
+
+ init_block_split_stats(&c->split_stats);
+ deflate_begin_sequences(c, seq);
+ min_len = calculate_min_match_len(in_next,
+ in_max_block_end - in_next,
+ c->max_search_depth);
+ do {
+ unsigned cur_len;
+ unsigned cur_offset;
+ unsigned next_len;
+ unsigned next_offset;
+
+ /*
+ * Recalculate the minimum match length if it hasn't
+ * been done recently.
+ */
+ if (in_next >= next_recalc_min_len) {
+ min_len = recalculate_min_match_len(
+ &c->freqs,
+ c->max_search_depth);
+ next_recalc_min_len +=
+ MIN(in_end - next_recalc_min_len,
+ in_next - in_block_begin);
+ }
+
+ /* Find the longest match at the current position. */
+ adjust_max_and_nice_len(&max_len, &nice_len,
+ in_end - in_next);
+ cur_len = hc_matchfinder_longest_match(
+ &c->p.g.hc_mf,
+ &in_cur_base,
+ in_next,
+ min_len - 1,
+ max_len,
+ nice_len,
+ c->max_search_depth,
+ next_hashes,
+ &cur_offset);
+ if (cur_len < min_len ||
+ (cur_len == DEFLATE_MIN_MATCH_LEN &&
+ cur_offset > 8192)) {
+ /* No match found. Choose a literal. */
+ deflate_choose_literal(c, *in_next++, true,
+ seq);
+ continue;
+ }
+ in_next++;
+
+have_cur_match:
+ /*
+ * We have a match at the current position.
+ * If it's very long, choose it immediately.
+ */
+ if (cur_len >= nice_len) {
+ deflate_choose_match(c, cur_len, cur_offset,
+ true, &seq);
+ hc_matchfinder_skip_bytes(&c->p.g.hc_mf,
+ &in_cur_base,
+ in_next,
+ in_end,
+ cur_len - 1,
+ next_hashes);
+ in_next += cur_len - 1;
+ continue;
+ }
+
+ /*
+ * Try to find a better match at the next position.
+ *
+ * Note: since we already have a match at the *current*
+ * position, we use only half the 'max_search_depth'
+ * when checking the *next* position. This is a useful
+ * trade-off because it's more worthwhile to use a
+ * greater search depth on the initial match.
+ *
+ * Note: it's possible to structure the code such that
+ * there's only one call to longest_match(), which
+ * handles both the "find the initial match" and "try to
+ * find a better match" cases. However, it is faster to
+ * have two call sites, with longest_match() inlined at
+ * each.
+ */
+ adjust_max_and_nice_len(&max_len, &nice_len,
+ in_end - in_next);
+ next_len = hc_matchfinder_longest_match(
+ &c->p.g.hc_mf,
+ &in_cur_base,
+ in_next++,
+ cur_len - 1,
+ max_len,
+ nice_len,
+ c->max_search_depth >> 1,
+ next_hashes,
+ &next_offset);
+ /*
+ * Scoring heuristic: prefer the next match only if its
+ * extra length (weighted x4) outweighs any increase in
+ * log2(offset) by more than 2.
+ */
+ if (next_len >= cur_len &&
+ 4 * (int)(next_len - cur_len) +
+ ((int)bsr32(cur_offset) -
+ (int)bsr32(next_offset)) > 2) {
+ /*
+ * Found a better match at the next position.
+ * Output a literal. Then the next match
+ * becomes the current match.
+ */
+ deflate_choose_literal(c, *(in_next - 2), true,
+ seq);
+ cur_len = next_len;
+ cur_offset = next_offset;
+ goto have_cur_match;
+ }
+
+ if (lazy2) {
+ /* In lazy2 mode, look ahead another position */
+ adjust_max_and_nice_len(&max_len, &nice_len,
+ in_end - in_next);
+ next_len = hc_matchfinder_longest_match(
+ &c->p.g.hc_mf,
+ &in_cur_base,
+ in_next++,
+ cur_len - 1,
+ max_len,
+ nice_len,
+ c->max_search_depth >> 2,
+ next_hashes,
+ &next_offset);
+ /* Same scoring idea, with a higher bar (>6). */
+ if (next_len >= cur_len &&
+ 4 * (int)(next_len - cur_len) +
+ ((int)bsr32(cur_offset) -
+ (int)bsr32(next_offset)) > 6) {
+ /*
+ * There's a much better match two
+ * positions ahead, so use two literals.
+ */
+ deflate_choose_literal(
+ c, *(in_next - 3), true, seq);
+ deflate_choose_literal(
+ c, *(in_next - 2), true, seq);
+ cur_len = next_len;
+ cur_offset = next_offset;
+ goto have_cur_match;
+ }
+ /*
+ * No better match at either of the next 2
+ * positions. Output the current match.
+ * (in_next is already 2 positions past the
+ * match start, hence the 'cur_len - 3'.)
+ */
+ deflate_choose_match(c, cur_len, cur_offset,
+ true, &seq);
+ if (cur_len > 3) {
+ hc_matchfinder_skip_bytes(&c->p.g.hc_mf,
+ &in_cur_base,
+ in_next,
+ in_end,
+ cur_len - 3,
+ next_hashes);
+ in_next += cur_len - 3;
+ }
+ } else { /* !lazy2 */
+ /*
+ * No better match at the next position. Output
+ * the current match. (in_next is already 2
+ * positions past the match start, hence the
+ * 'cur_len - 2'.)
+ */
+ deflate_choose_match(c, cur_len, cur_offset,
+ true, &seq);
+ hc_matchfinder_skip_bytes(&c->p.g.hc_mf,
+ &in_cur_base,
+ in_next,
+ in_end,
+ cur_len - 2,
+ next_hashes);
+ in_next += cur_len - 2;
+ }
+ /* Check if it's time to output another block. */
+ } while (in_next < in_max_block_end &&
+ seq < &c->p.g.sequences[SEQ_STORE_LENGTH] &&
+ !should_end_block(&c->split_stats,
+ in_block_begin, in_next, in_end));
+
+ deflate_flush_block(c, os, in_block_begin,
+ in_next - in_block_begin,
+ c->p.g.sequences, in_next == in_end);
+ } while (in_next != in_end);
+}
+
+/*
+ * This is the "lazy" DEFLATE compressor. Before choosing a match, it checks to
+ * see if there's a better match at the next position. If yes, it outputs a
+ * literal and continues to the next position. If no, it outputs the match.
+ */
+static void
+deflate_compress_lazy(struct libdeflate_compressor * restrict c,
+ const u8 *in, size_t in_nbytes,
+ struct deflate_output_bitstream *os)
+{
+ /* lazy2=false: look ahead only one position past each match. */
+ deflate_compress_lazy_generic(c, in, in_nbytes, os, false);
+}
+
+/*
+ * The lazy2 compressor. This is similar to the regular lazy one, but it looks
+ * for a better match at the next 2 positions rather than the next 1. This
+ * makes it take slightly more time, but compress some inputs slightly more.
+ */
+static void
+deflate_compress_lazy2(struct libdeflate_compressor * restrict c,
+ const u8 *in, size_t in_nbytes,
+ struct deflate_output_bitstream *os)
+{
+ /* lazy2=true: look ahead up to two positions past each match. */
+ deflate_compress_lazy_generic(c, in, in_nbytes, os, true);
+}
+
+#if SUPPORT_NEAR_OPTIMAL_PARSING
+
+/*
+ * Follow the minimum-cost path in the graph of possible match/literal choices
+ * for the current block and compute the frequencies of the Huffman symbols that
+ * would be needed to output those matches and literals.
+ */
+static void
+deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length)
+{
+ struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0];
+ struct deflate_optimum_node *end_node =
+ &c->p.n.optimum_nodes[block_length];
+
+ do {
+ /*
+ * Item encoding: length == 1 means a literal, with the
+ * literal byte stored in the offset field; otherwise it's
+ * a match of the given length and offset.
+ */
+ unsigned length = cur_node->item & OPTIMUM_LEN_MASK;
+ unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
+
+ if (length == 1) {
+ /* Literal */
+ c->freqs.litlen[offset]++;
+ } else {
+ /* Match */
+ c->freqs.litlen[DEFLATE_FIRST_LEN_SYM +
+ deflate_length_slot[length]]++;
+ c->freqs.offset[c->p.n.offset_slot_full[offset]]++;
+ }
+ cur_node += length;
+ } while (cur_node != end_node);
+
+ /* Tally the end-of-block symbol. */
+ c->freqs.litlen[DEFLATE_END_OF_BLOCK]++;
+}
+
+/* Set the current cost model from the codeword lengths specified in @lens. */
+static void
+deflate_set_costs_from_codes(struct libdeflate_compressor *c,
+ const struct deflate_lens *lens)
+{
+ unsigned i;
+
+ /*
+ * Literals. A codeword length of 0 means the symbol was unused in
+ * the previous pass; fall back to LITERAL_NOSTAT_BITS so it still
+ * has a finite cost.
+ */
+ for (i = 0; i < DEFLATE_NUM_LITERALS; i++) {
+ u32 bits = (lens->litlen[i] ?
+ lens->litlen[i] : LITERAL_NOSTAT_BITS);
+
+ c->p.n.costs.literal[i] = bits * BIT_COST;
+ }
+
+ /* Lengths (cost includes the slot's extra bits) */
+ for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) {
+ unsigned length_slot = deflate_length_slot[i];
+ unsigned litlen_sym = DEFLATE_FIRST_LEN_SYM + length_slot;
+ u32 bits = (lens->litlen[litlen_sym] ?
+ lens->litlen[litlen_sym] : LENGTH_NOSTAT_BITS);
+
+ bits += deflate_extra_length_bits[length_slot];
+ c->p.n.costs.length[i] = bits * BIT_COST;
+ }
+
+ /* Offset slots (cost includes the slot's extra bits) */
+ for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) {
+ u32 bits = (lens->offset[i] ?
+ lens->offset[i] : OFFSET_NOSTAT_BITS);
+
+ bits += deflate_extra_offset_bits[i];
+ c->p.n.costs.offset_slot[i] = bits * BIT_COST;
+ }
+}
+
+/*
+ * This lookup table gives the default cost of a literal symbol and of a length
+ * symbol, depending on the characteristics of the input data. It was generated
+ * by scripts/gen_default_litlen_costs.py.
+ *
+ * This table is indexed first by the estimated match probability:
+ *
+ * i=0: data doesn't contain many matches [match_prob=0.25]
+ * i=1: neutral [match_prob=0.50]
+ * i=2: data contains lots of matches [match_prob=0.75]
+ *
+ * This lookup produces a subtable which maps the number of distinct used
+ * literals to the default cost of a literal symbol, i.e.:
+ *
+ * int(-log2((1 - match_prob) / num_used_literals) * BIT_COST)
+ *
+ * ... for num_used_literals in [1, 256] (and 0, which is copied from 1). This
+ * accounts for literals usually getting cheaper as the number of distinct
+ * literals decreases, and as the proportion of literals to matches increases.
+ *
+ * The lookup also produces the cost of a length symbol, which is:
+ *
+ * int(-log2(match_prob/NUM_LEN_SLOTS) * BIT_COST)
+ *
+ * Note: we don't currently assign different costs to different literal symbols,
+ * or to different length symbols, as this is hard to do in a useful way.
+ */
+static const struct {
+ u8 used_lits_to_lit_cost[257]; /* indexed by number of distinct used literals (0..256) */
+ u8 len_sym_cost; /* cost of any length symbol, in BIT_COST units */
+} default_litlen_costs[] = {
+ { /* match_prob = 0.25 */
+ .used_lits_to_lit_cost = {
+ 6, 6, 22, 32, 38, 43, 48, 51,
+ 54, 57, 59, 61, 64, 65, 67, 69,
+ 70, 72, 73, 74, 75, 76, 77, 79,
+ 80, 80, 81, 82, 83, 84, 85, 85,
+ 86, 87, 88, 88, 89, 89, 90, 91,
+ 91, 92, 92, 93, 93, 94, 95, 95,
+ 96, 96, 96, 97, 97, 98, 98, 99,
+ 99, 99, 100, 100, 101, 101, 101, 102,
+ 102, 102, 103, 103, 104, 104, 104, 105,
+ 105, 105, 105, 106, 106, 106, 107, 107,
+ 107, 108, 108, 108, 108, 109, 109, 109,
+ 109, 110, 110, 110, 111, 111, 111, 111,
+ 112, 112, 112, 112, 112, 113, 113, 113,
+ 113, 114, 114, 114, 114, 114, 115, 115,
+ 115, 115, 115, 116, 116, 116, 116, 116,
+ 117, 117, 117, 117, 117, 118, 118, 118,
+ 118, 118, 118, 119, 119, 119, 119, 119,
+ 120, 120, 120, 120, 120, 120, 121, 121,
+ 121, 121, 121, 121, 121, 122, 122, 122,
+ 122, 122, 122, 123, 123, 123, 123, 123,
+ 123, 123, 124, 124, 124, 124, 124, 124,
+ 124, 125, 125, 125, 125, 125, 125, 125,
+ 125, 126, 126, 126, 126, 126, 126, 126,
+ 127, 127, 127, 127, 127, 127, 127, 127,
+ 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 129, 129, 129, 129, 129, 129, 129,
+ 129, 129, 130, 130, 130, 130, 130, 130,
+ 130, 130, 130, 131, 131, 131, 131, 131,
+ 131, 131, 131, 131, 131, 132, 132, 132,
+ 132, 132, 132, 132, 132, 132, 132, 133,
+ 133, 133, 133, 133, 133, 133, 133, 133,
+ 133, 134, 134, 134, 134, 134, 134, 134,
+ 134,
+ },
+ .len_sym_cost = 109,
+ }, { /* match_prob = 0.5 */
+ .used_lits_to_lit_cost = {
+ 16, 16, 32, 41, 48, 53, 57, 60,
+ 64, 66, 69, 71, 73, 75, 76, 78,
+ 80, 81, 82, 83, 85, 86, 87, 88,
+ 89, 90, 91, 92, 92, 93, 94, 95,
+ 96, 96, 97, 98, 98, 99, 99, 100,
+ 101, 101, 102, 102, 103, 103, 104, 104,
+ 105, 105, 106, 106, 107, 107, 108, 108,
+ 108, 109, 109, 110, 110, 110, 111, 111,
+ 112, 112, 112, 113, 113, 113, 114, 114,
+ 114, 115, 115, 115, 115, 116, 116, 116,
+ 117, 117, 117, 118, 118, 118, 118, 119,
+ 119, 119, 119, 120, 120, 120, 120, 121,
+ 121, 121, 121, 122, 122, 122, 122, 122,
+ 123, 123, 123, 123, 124, 124, 124, 124,
+ 124, 125, 125, 125, 125, 125, 126, 126,
+ 126, 126, 126, 127, 127, 127, 127, 127,
+ 128, 128, 128, 128, 128, 128, 129, 129,
+ 129, 129, 129, 129, 130, 130, 130, 130,
+ 130, 130, 131, 131, 131, 131, 131, 131,
+ 131, 132, 132, 132, 132, 132, 132, 133,
+ 133, 133, 133, 133, 133, 133, 134, 134,
+ 134, 134, 134, 134, 134, 134, 135, 135,
+ 135, 135, 135, 135, 135, 135, 136, 136,
+ 136, 136, 136, 136, 136, 136, 137, 137,
+ 137, 137, 137, 137, 137, 137, 138, 138,
+ 138, 138, 138, 138, 138, 138, 138, 139,
+ 139, 139, 139, 139, 139, 139, 139, 139,
+ 140, 140, 140, 140, 140, 140, 140, 140,
+ 140, 141, 141, 141, 141, 141, 141, 141,
+ 141, 141, 141, 142, 142, 142, 142, 142,
+ 142, 142, 142, 142, 142, 142, 143, 143,
+ 143, 143, 143, 143, 143, 143, 143, 143,
+ 144,
+ },
+ .len_sym_cost = 93,
+ }, { /* match_prob = 0.75 */
+ .used_lits_to_lit_cost = {
+ 32, 32, 48, 57, 64, 69, 73, 76,
+ 80, 82, 85, 87, 89, 91, 92, 94,
+ 96, 97, 98, 99, 101, 102, 103, 104,
+ 105, 106, 107, 108, 108, 109, 110, 111,
+ 112, 112, 113, 114, 114, 115, 115, 116,
+ 117, 117, 118, 118, 119, 119, 120, 120,
+ 121, 121, 122, 122, 123, 123, 124, 124,
+ 124, 125, 125, 126, 126, 126, 127, 127,
+ 128, 128, 128, 129, 129, 129, 130, 130,
+ 130, 131, 131, 131, 131, 132, 132, 132,
+ 133, 133, 133, 134, 134, 134, 134, 135,
+ 135, 135, 135, 136, 136, 136, 136, 137,
+ 137, 137, 137, 138, 138, 138, 138, 138,
+ 139, 139, 139, 139, 140, 140, 140, 140,
+ 140, 141, 141, 141, 141, 141, 142, 142,
+ 142, 142, 142, 143, 143, 143, 143, 143,
+ 144, 144, 144, 144, 144, 144, 145, 145,
+ 145, 145, 145, 145, 146, 146, 146, 146,
+ 146, 146, 147, 147, 147, 147, 147, 147,
+ 147, 148, 148, 148, 148, 148, 148, 149,
+ 149, 149, 149, 149, 149, 149, 150, 150,
+ 150, 150, 150, 150, 150, 150, 151, 151,
+ 151, 151, 151, 151, 151, 151, 152, 152,
+ 152, 152, 152, 152, 152, 152, 153, 153,
+ 153, 153, 153, 153, 153, 153, 154, 154,
+ 154, 154, 154, 154, 154, 154, 154, 155,
+ 155, 155, 155, 155, 155, 155, 155, 155,
+ 156, 156, 156, 156, 156, 156, 156, 156,
+ 156, 157, 157, 157, 157, 157, 157, 157,
+ 157, 157, 157, 158, 158, 158, 158, 158,
+ 158, 158, 158, 158, 158, 158, 159, 159,
+ 159, 159, 159, 159, 159, 159, 159, 159,
+ 160,
+ },
+ .len_sym_cost = 84,
+ },
+};
+
+/*
+ * Choose the default costs for literal and length symbols. These symbols are
+ * both part of the litlen alphabet.
+ */
+static void
+deflate_choose_default_litlen_costs(struct libdeflate_compressor *c,
+ const u8 *block_begin, u32 block_length,
+ u32 *lit_cost, u32 *len_sym_cost)
+{
+ unsigned num_used_literals = 0;
+ u32 literal_freq = block_length;
+ u32 match_freq = 0;
+ u32 cutoff;
+ u32 i;
+
+ /* Calculate the number of distinct literals that exist in the data. */
+ memset(c->freqs.litlen, 0,
+ DEFLATE_NUM_LITERALS * sizeof(c->freqs.litlen[0]));
+ cutoff = literal_freq >> 11; /* Ignore literals used very rarely. */
+ for (i = 0; i < block_length; i++)
+ c->freqs.litlen[block_begin[i]]++;
+ for (i = 0; i < DEFLATE_NUM_LITERALS; i++) {
+ if (c->freqs.litlen[i] > cutoff)
+ num_used_literals++;
+ }
+ if (num_used_literals == 0)
+ num_used_literals = 1;
+
+ /*
+ * Estimate the relative frequency of literals and matches in the
+ * optimal parsing solution. We don't know the optimal solution, so
+ * this can only be a very rough estimate. Therefore, we basically use
+ * the match frequency from a greedy parse. We also apply the min_len
+ * heuristic used by the greedy and lazy parsers, to avoid counting too
+ * many matches when literals are cheaper than short matches.
+ */
+ match_freq = 0;
+ i = choose_min_match_len(num_used_literals, c->max_search_depth);
+ for (; i < ARRAY_LEN(c->p.n.match_len_freqs); i++) {
+ match_freq += c->p.n.match_len_freqs[i];
+ /* Each match of length i replaces i literals. */
+ literal_freq -= i * c->p.n.match_len_freqs[i];
+ }
+ if ((s32)literal_freq < 0) /* shouldn't happen */
+ literal_freq = 0;
+
+ if (match_freq > literal_freq)
+ i = 2; /* many matches */
+ else if (match_freq * 4 > literal_freq)
+ i = 1; /* neutral */
+ else
+ i = 0; /* few matches */
+
+ /* The table values assume costs are in units of BIT_COST == 16. */
+ STATIC_ASSERT(BIT_COST == 16);
+ *lit_cost = default_litlen_costs[i].used_lits_to_lit_cost[
+ num_used_literals];
+ *len_sym_cost = default_litlen_costs[i].len_sym_cost;
+}
+
+/*
+ * Return the default cost of a match length: the cost of its length symbol
+ * plus the cost of the length slot's extra bits, in BIT_COST units.
+ */
+static forceinline u32
+deflate_default_length_cost(unsigned len, u32 len_sym_cost)
+{
+ unsigned slot = deflate_length_slot[len];
+ u32 num_extra_bits = deflate_extra_length_bits[slot];
+
+ return len_sym_cost + (num_extra_bits * BIT_COST);
+}
+
+/* Return the default cost of the given offset slot, in BIT_COST units. */
+static forceinline u32
+deflate_default_offset_slot_cost(unsigned slot)
+{
+ u32 num_extra_bits = deflate_extra_offset_bits[slot];
+ /*
+ * Assume that all offset symbols are equally probable.
+ * The resulting cost is 'int(-log2(1/30) * BIT_COST)',
+ * where 30 is the number of potentially-used offset symbols.
+ */
+ u32 offset_sym_cost = 4*BIT_COST + (907*BIT_COST)/1000;
+
+ return offset_sym_cost + (num_extra_bits * BIT_COST);
+}
+
+/* Set default symbol costs for the first block's first optimization pass. */
+static void
+deflate_set_default_costs(struct libdeflate_compressor *c,
+ u32 lit_cost, u32 len_sym_cost)
+{
+ unsigned i;
+
+ /*
+ * Literals: all get the same default cost, as computed by
+ * deflate_choose_default_litlen_costs().
+ */
+ for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
+ c->p.n.costs.literal[i] = lit_cost;
+
+ /* Lengths */
+ for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++)
+ c->p.n.costs.length[i] =
+ deflate_default_length_cost(i, len_sym_cost);
+
+ /* Offset slots */
+ for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++)
+ c->p.n.costs.offset_slot[i] =
+ deflate_default_offset_slot_cost(i);
+}
+
+/*
+ * Blend a cost carried over from the previous block toward its default
+ * value. The larger 'change_amount' (0-3) is, the more weight the default
+ * cost receives relative to the previous cost.
+ */
+static forceinline void
+deflate_adjust_cost(u32 *cost_p, u32 default_cost, int change_amount)
+{
+ if (change_amount == 0)
+ /* Block is very similar to previous; prefer previous costs. */
+ *cost_p = (default_cost + 3 * *cost_p) / 4;
+ else if (change_amount == 1)
+ *cost_p = (default_cost + *cost_p) / 2;
+ else if (change_amount == 2)
+ *cost_p = (5 * default_cost + 3 * *cost_p) / 8;
+ else
+ /* Block differs greatly from previous; prefer default costs. */
+ *cost_p = (3 * default_cost + *cost_p) / 4;
+}
+
+/*
+ * Apply deflate_adjust_cost() with the given 'change_amount' to every
+ * literal, length, and offset-slot cost.
+ */
+static forceinline void
+deflate_adjust_costs_impl(struct libdeflate_compressor *c,
+ u32 lit_cost, u32 len_sym_cost, int change_amount)
+{
+ unsigned i;
+
+ /* Literals */
+ for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
+ deflate_adjust_cost(&c->p.n.costs.literal[i], lit_cost,
+ change_amount);
+
+ /* Lengths */
+ for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++)
+ deflate_adjust_cost(&c->p.n.costs.length[i],
+ deflate_default_length_cost(i,
+ len_sym_cost),
+ change_amount);
+
+ /* Offset slots */
+ for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++)
+ deflate_adjust_cost(&c->p.n.costs.offset_slot[i],
+ deflate_default_offset_slot_cost(i),
+ change_amount);
+}
+
+/*
+ * Adjust the costs when beginning a new block.
+ *
+ * Since the current costs have been optimized for the data, it's undesirable to
+ * throw them away and start over with the default costs. At the same time, we
+ * don't want to bias the parse by assuming that the next block will be similar
+ * to the current block. As a compromise, make the costs closer to the
+ * defaults, but don't simply set them to the defaults.
+ */
+static void
+deflate_adjust_costs(struct libdeflate_compressor *c,
+ u32 lit_cost, u32 len_sym_cost)
+{
+ u64 total_delta = 0;
+ u64 cutoff;
+ int i;
+
+ /*
+ * Decide how different the current block is from the previous block,
+ * using the block splitting statistics from the current and previous
+ * blocks. The more different the current block is, the more we prefer
+ * the default costs rather than the previous block's costs.
+ *
+ * The algorithm here is similar to the end-of-block check one, but here
+ * we compare two entire blocks rather than a partial block with a small
+ * extra part, and therefore we need 64-bit numbers in some places.
+ */
+ for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
+ /* Cross-multiply to compare the two frequency ratios. */
+ u64 prev = (u64)c->p.n.prev_observations[i] *
+ c->split_stats.num_observations;
+ u64 cur = (u64)c->split_stats.observations[i] *
+ c->p.n.prev_num_observations;
+
+ total_delta += prev > cur ? prev - cur : cur - prev;
+ }
+ cutoff = ((u64)c->p.n.prev_num_observations *
+ c->split_stats.num_observations * 200) / 512;
+
+ /* Map the delta/cutoff ratio to a change_amount of 3, 2, 1, or 0. */
+ if (4 * total_delta > 9 * cutoff)
+ deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 3);
+ else if (2 * total_delta > 3 * cutoff)
+ deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 2);
+ else if (2 * total_delta > cutoff)
+ deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 1);
+ else
+ deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 0);
+}
+
+/*
+ * Find the minimum-cost path through the graph of possible match/literal
+ * choices for this block.
+ *
+ * We find the minimum cost path from 'c->p.n.optimum_nodes[0]', which
+ * represents the node at the beginning of the block, to
+ * 'c->p.n.optimum_nodes[block_length]', which represents the node at the end of
+ * the block. Edge costs are evaluated using the cost model 'c->p.n.costs'.
+ *
+ * The algorithm works backwards, starting at the end node and proceeding
+ * backwards one node at a time. At each node, the minimum cost to reach the
+ * end node is computed and the match/literal choice that begins that path is
+ * saved.
+ */
+static void
+deflate_find_min_cost_path(struct libdeflate_compressor *c,
+ const u32 block_length,
+ const struct lz_match *cache_ptr)
+{
+ struct deflate_optimum_node *end_node =
+ &c->p.n.optimum_nodes[block_length];
+ struct deflate_optimum_node *cur_node = end_node;
+
+ cur_node->cost_to_end = 0;
+ do {
+ unsigned num_matches;
+ unsigned literal;
+ u32 best_cost_to_end;
+
+ cur_node--;
+ cache_ptr--;
+
+ /*
+ * Match cache layout: each position's matches are followed by
+ * a trailer entry whose 'length' field holds the number of
+ * matches and whose 'offset' field holds the literal byte at
+ * that position (see the matchfinding loop that fills the
+ * cache). We walk the cache backwards, trailer first.
+ */
+ num_matches = cache_ptr->length;
+ literal = cache_ptr->offset;
+
+ /* It's always possible to choose a literal. */
+ best_cost_to_end = c->p.n.costs.literal[literal] +
+ (cur_node + 1)->cost_to_end;
+ cur_node->item = ((u32)literal << OPTIMUM_OFFSET_SHIFT) | 1;
+
+ /* Also consider matches if there are any. */
+ if (num_matches) {
+ const struct lz_match *match;
+ unsigned len;
+ unsigned offset;
+ unsigned offset_slot;
+ u32 offset_cost;
+ u32 cost_to_end;
+
+ /*
+ * Consider each length from the minimum
+ * (DEFLATE_MIN_MATCH_LEN) to the length of the longest
+ * match found at this position. For each length, we
+ * consider only the smallest offset for which that
+ * length is available. Although this is not guaranteed
+ * to be optimal due to the possibility of a larger
+ * offset costing less than a smaller offset to code,
+ * this is a very useful heuristic.
+ */
+ match = cache_ptr - num_matches;
+ len = DEFLATE_MIN_MATCH_LEN;
+ do {
+ offset = match->offset;
+ offset_slot = c->p.n.offset_slot_full[offset];
+ offset_cost =
+ c->p.n.costs.offset_slot[offset_slot];
+ do {
+ cost_to_end = offset_cost +
+ c->p.n.costs.length[len] +
+ (cur_node + len)->cost_to_end;
+ if (cost_to_end < best_cost_to_end) {
+ best_cost_to_end = cost_to_end;
+ cur_node->item = len |
+ ((u32)offset <<
+ OPTIMUM_OFFSET_SHIFT);
+ }
+ } while (++len <= match->length);
+ } while (++match != cache_ptr);
+ cache_ptr -= num_matches;
+ }
+ cur_node->cost_to_end = best_cost_to_end;
+ } while (cur_node != &c->p.n.optimum_nodes[0]);
+}
+
+/*
+ * Choose the literal/match sequence to use for the current block. The basic
+ * algorithm finds a minimum-cost path through the block's graph of
+ * literal/match choices, given a cost model. However, the cost of each symbol
+ * is unknown until the Huffman codes have been built, but at the same time the
+ * Huffman codes depend on the frequencies of chosen symbols. Consequently,
+ * multiple passes must be used to try to approximate an optimal solution. The
+ * first pass uses default costs, mixed with the costs from the previous block
+ * if any. Later passes use the Huffman codeword lengths from the previous pass
+ * as the costs.
+ */
+static void
+deflate_optimize_block(struct libdeflate_compressor *c,
+ const u8 *block_begin, u32 block_length,
+ const struct lz_match *cache_ptr, bool is_first_block,
+ bool is_final_block)
+{
+ unsigned num_passes_remaining = c->p.n.num_optim_passes;
+ u32 lit_cost, len_sym_cost;
+ u32 i;
+
+ /*
+ * Force the block to really end at the desired length, even if some
+ * matches extend beyond it. 0x80000000 acts as an effectively
+ * infinite cost, so the path search never crosses the block end.
+ */
+ for (i = block_length;
+ i <= MIN(block_length - 1 + DEFLATE_MAX_MATCH_LEN,
+ ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++)
+ c->p.n.optimum_nodes[i].cost_to_end = 0x80000000;
+
+ /* Set the initial costs. */
+ deflate_choose_default_litlen_costs(c, block_begin, block_length,
+ &lit_cost, &len_sym_cost);
+ if (is_first_block)
+ deflate_set_default_costs(c, lit_cost, len_sym_cost);
+ else
+ deflate_adjust_costs(c, lit_cost, len_sym_cost);
+
+ do {
+ /* Find the minimum cost path for this pass. */
+ deflate_find_min_cost_path(c, block_length, cache_ptr);
+
+ /* Compute frequencies of the chosen symbols. */
+ deflate_reset_symbol_frequencies(c);
+ deflate_tally_item_list(c, block_length);
+
+ /* Make the Huffman codes. */
+ deflate_make_huffman_codes(&c->freqs, &c->codes);
+
+ /*
+ * Update the costs. After the last optimization pass, the
+ * final costs won't be needed for this block, but they will be
+ * used in determining the initial costs for the next block.
+ */
+ if (--num_passes_remaining || !is_final_block)
+ deflate_set_costs_from_codes(c, &c->codes.lens);
+ } while (num_passes_remaining);
+}
+
+/* Reset all literal/match statistics used by the near-optimal compressor. */
+static void
+deflate_near_optimal_init_stats(struct libdeflate_compressor *c)
+{
+ init_block_split_stats(&c->split_stats);
+ memset(c->p.n.new_match_len_freqs, 0,
+ sizeof(c->p.n.new_match_len_freqs));
+ memset(c->p.n.match_len_freqs, 0, sizeof(c->p.n.match_len_freqs));
+}
+
+/*
+ * Fold the pending "new" observations and match length frequencies into the
+ * accumulated statistics, then clear the "new" counters.
+ */
+static void
+deflate_near_optimal_merge_stats(struct libdeflate_compressor *c)
+{
+ unsigned i;
+
+ merge_new_observations(&c->split_stats);
+ for (i = 0; i < ARRAY_LEN(c->p.n.match_len_freqs); i++) {
+ c->p.n.match_len_freqs[i] += c->p.n.new_match_len_freqs[i];
+ c->p.n.new_match_len_freqs[i] = 0;
+ }
+}
+
+/*
+ * Save some literal/match statistics from the previous block so that
+ * deflate_adjust_costs() will be able to decide how much the current block
+ * differs from the previous one.
+ */
+static void
+deflate_near_optimal_save_stats(struct libdeflate_compressor *c)
+{
+ int i;
+
+ for (i = 0; i < NUM_OBSERVATION_TYPES; i++)
+ c->p.n.prev_observations[i] = c->split_stats.observations[i];
+ c->p.n.prev_num_observations = c->split_stats.num_observations;
+}
+
+/*
+ * Discard the accumulated (merged) statistics, keeping only any pending
+ * "new" observations that have not yet been merged.
+ */
+static void
+deflate_near_optimal_clear_old_stats(struct libdeflate_compressor *c)
+{
+ int i;
+
+ for (i = 0; i < NUM_OBSERVATION_TYPES; i++)
+ c->split_stats.observations[i] = 0;
+ c->split_stats.num_observations = 0;
+ memset(c->p.n.match_len_freqs, 0, sizeof(c->p.n.match_len_freqs));
+}
+
+/*
+ * This is the "near-optimal" DEFLATE compressor. It computes the optimal
+ * representation of each DEFLATE block using a minimum-cost path search over
+ * the graph of possible match/literal choices for that block, assuming a
+ * certain cost for each Huffman symbol.
+ *
+ * For several reasons, the end result is not guaranteed to be optimal:
+ *
+ * - Nonoptimal choice of blocks
+ * - Heuristic limitations on which matches are actually considered
+ * - Symbol costs are unknown until the symbols have already been chosen
+ * (so iterative optimization must be used)
+ */
+static void
+deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
+ const u8 *in, size_t in_nbytes,
+ struct deflate_output_bitstream *os)
+{
+ const u8 *in_next = in;
+ const u8 *in_block_begin = in_next;
+ const u8 *in_end = in_next + in_nbytes;
+ const u8 *in_cur_base = in_next;
+ const u8 *in_next_slide =
+ in_next + MIN(in_end - in_next, MATCHFINDER_WINDOW_SIZE);
+ unsigned max_len = DEFLATE_MAX_MATCH_LEN;
+ unsigned nice_len = MIN(c->nice_match_length, max_len);
+ struct lz_match *cache_ptr = c->p.n.match_cache;
+ u32 next_hashes[2] = {0, 0};
+
+ bt_matchfinder_init(&c->p.n.bt_mf);
+ deflate_near_optimal_init_stats(c);
+
+ do {
+ /* Starting a new DEFLATE block */
+ const u8 * const in_max_block_end = choose_max_block_end(
+ in_block_begin, in_end, SOFT_MAX_BLOCK_LENGTH);
+ const u8 *prev_end_block_check = NULL;
+ bool change_detected = false;
+ const u8 *next_observation = in_next;
+ unsigned min_len;
+
+ /*
+ * Use the minimum match length heuristic to improve the
+ * literal/match statistics gathered during matchfinding.
+ * However, the actual near-optimal parse won't respect min_len,
+ * as it can accurately assess the costs of different matches.
+ */
+ min_len = calculate_min_match_len(
+ in_block_begin,
+ in_max_block_end - in_block_begin,
+ c->max_search_depth);
+
+ /*
+ * Find matches until we decide to end the block. We end the
+ * block if any of the following is true:
+ *
+ * (1) Maximum block length has been reached
+ * (2) Match cache may overflow.
+ * (3) Block split heuristic says to split now.
+ */
+ for (;;) {
+ struct lz_match *matches;
+ unsigned best_len;
+ size_t remaining = in_end - in_next;
+
+ /* Slide the window forward if needed. */
+ if (in_next == in_next_slide) {
+ bt_matchfinder_slide_window(&c->p.n.bt_mf);
+ in_cur_base = in_next;
+ in_next_slide = in_next +
+ MIN(remaining, MATCHFINDER_WINDOW_SIZE);
+ }
+
+ /*
+ * Find matches with the current position using the
+ * binary tree matchfinder and save them in match_cache.
+ *
+ * Note: the binary tree matchfinder is more suited for
+ * optimal parsing than the hash chain matchfinder. The
+ * reasons for this include:
+ *
+ * - The binary tree matchfinder can find more matches
+ * in the same number of steps.
+ * - One of the major advantages of hash chains is that
+ * skipping positions (not searching for matches at
+ * them) is faster; however, with optimal parsing we
+ * search for matches at almost all positions, so this
+ * advantage of hash chains is negated.
+ */
+ matches = cache_ptr;
+ best_len = 0;
+ adjust_max_and_nice_len(&max_len, &nice_len, remaining);
+ if (likely(max_len >= BT_MATCHFINDER_REQUIRED_NBYTES)) {
+ cache_ptr = bt_matchfinder_get_matches(
+ &c->p.n.bt_mf,
+ in_cur_base,
+ in_next - in_cur_base,
+ max_len,
+ nice_len,
+ c->max_search_depth,
+ next_hashes,
+ matches);
+ if (cache_ptr > matches)
+ best_len = cache_ptr[-1].length;
+ }
+ if (in_next >= next_observation) {
+ if (best_len >= min_len) {
+ observe_match(&c->split_stats,
+ best_len);
+ next_observation = in_next + best_len;
+ c->p.n.new_match_len_freqs[best_len]++;
+ } else {
+ observe_literal(&c->split_stats,
+ *in_next);
+ next_observation = in_next + 1;
+ }
+ }
+
+ cache_ptr->length = cache_ptr - matches;
+ cache_ptr->offset = *in_next;
+ in_next++;
+ cache_ptr++;
+
+ /*
+ * If there was a very long match found, don't cache any
+ * matches for the bytes covered by that match. This
+ * avoids degenerate behavior when compressing highly
+ * redundant data, where the number of matches can be
+ * very large.
+ *
+ * This heuristic doesn't actually hurt the compression
+ * ratio very much. If there's a long match, then the
+ * data must be highly compressible, so it doesn't
+ * matter much what we do.
+ */
+ if (best_len >= DEFLATE_MIN_MATCH_LEN &&
+ best_len >= nice_len) {
+ --best_len;
+ do {
+ remaining = in_end - in_next;
+ if (in_next == in_next_slide) {
+ bt_matchfinder_slide_window(
+ &c->p.n.bt_mf);
+ in_cur_base = in_next;
+ in_next_slide = in_next +
+ MIN(remaining,
+ MATCHFINDER_WINDOW_SIZE);
+ }
+ adjust_max_and_nice_len(&max_len,
+ &nice_len,
+ remaining);
+ if (max_len >=
+ BT_MATCHFINDER_REQUIRED_NBYTES) {
+ bt_matchfinder_skip_byte(
+ &c->p.n.bt_mf,
+ in_cur_base,
+ in_next - in_cur_base,
+ nice_len,
+ c->max_search_depth,
+ next_hashes);
+ }
+ cache_ptr->length = 0;
+ cache_ptr->offset = *in_next;
+ in_next++;
+ cache_ptr++;
+ } while (--best_len);
+ }
+ /* Maximum block length or end of input reached? */
+ if (in_next >= in_max_block_end)
+ break;
+ /* Match cache overflowed? */
+ if (cache_ptr >=
+ &c->p.n.match_cache[MATCH_CACHE_LENGTH])
+ break;
+ /* Not ready to try to end the block (again)? */
+ if (!ready_to_check_block(&c->split_stats,
+ in_block_begin, in_next,
+ in_end))
+ continue;
+ /* Check if it would be worthwhile to end the block. */
+ if (do_end_block_check(&c->split_stats,
+ in_next - in_block_begin)) {
+ change_detected = true;
+ break;
+ }
+ /* Ending the block doesn't seem worthwhile here. */
+ deflate_near_optimal_merge_stats(c);
+ prev_end_block_check = in_next;
+ }
+ /*
+ * All the matches for this block have been cached. Now choose
+ * the precise end of the block and the sequence of items to
+ * output to represent it, then flush the block.
+ */
+ if (change_detected && prev_end_block_check != NULL) {
+ /*
+ * The block is being ended because a recent chunk of
+ * data differs from the rest of the block. We could
+ * end the block at 'in_next' like the greedy and lazy
+ * compressors do, but that's not ideal since it would
+ * include the differing chunk in the block. The
+ * near-optimal compressor has time to do a better job.
+ * Therefore, we rewind to just before the chunk, and
+ * output a block that only goes up to there.
+ *
+ * We then set things up to correctly start the next
+ * block, considering that some work has already been
+ * done on it (some matches found and stats gathered).
+ */
+ struct lz_match *orig_cache_ptr = cache_ptr;
+ const u8 *in_block_end = prev_end_block_check;
+ u32 block_length = in_block_end - in_block_begin;
+ bool is_first = (in_block_begin == in);
+ bool is_final = false;
+ u32 num_bytes_to_rewind = in_next - in_block_end;
+ size_t cache_len_rewound;
+
+ /* Rewind the match cache. */
+ do {
+ cache_ptr--;
+ cache_ptr -= cache_ptr->length;
+ } while (--num_bytes_to_rewind);
+ cache_len_rewound = orig_cache_ptr - cache_ptr;
+
+ deflate_optimize_block(c, in_block_begin, block_length,
+ cache_ptr, is_first, is_final);
+ deflate_flush_block(c, os, in_block_begin, block_length,
+ NULL, is_final);
+ memmove(c->p.n.match_cache, cache_ptr,
+ cache_len_rewound * sizeof(*cache_ptr));
+ cache_ptr = &c->p.n.match_cache[cache_len_rewound];
+ deflate_near_optimal_save_stats(c);
+ /*
+ * Clear the stats for the just-flushed block, leaving
+ * just the stats for the beginning of the next block.
+ */
+ deflate_near_optimal_clear_old_stats(c);
+ in_block_begin = in_block_end;
+ } else {
+ /*
+ * The block is being ended for a reason other than a
+ * differing data chunk being detected. Don't rewind at
+ * all; just end the block at the current position.
+ */
+ u32 block_length = in_next - in_block_begin;
+ bool is_first = (in_block_begin == in);
+ bool is_final = (in_next == in_end);
+
+ deflate_near_optimal_merge_stats(c);
+ deflate_optimize_block(c, in_block_begin, block_length,
+ cache_ptr, is_first, is_final);
+ deflate_flush_block(c, os, in_block_begin, block_length,
+ NULL, is_final);
+ cache_ptr = &c->p.n.match_cache[0];
+ deflate_near_optimal_save_stats(c);
+ deflate_near_optimal_init_stats(c);
+ in_block_begin = in_next;
+ }
+ } while (in_next != in_end);
+}
+
+/*
+ * Initialize c->p.n.offset_slot_full: a direct lookup table that maps each
+ * representable match offset to its DEFLATE offset slot, so the near-optimal
+ * parser can find an offset's slot with a single array index instead of a
+ * search.  Only needed (and only called) for the near-optimal levels.
+ */
+static void
+deflate_init_offset_slot_full(struct libdeflate_compressor *c)
+{
+	unsigned offset_slot;
+	unsigned offset;
+	unsigned offset_end;
+
+	/*
+	 * For each offset slot, fill in every table entry in that slot's
+	 * range: [base, base + 2**extra_bits).  Together the slots' ranges
+	 * cover all valid offsets, so every entry gets written.
+	 */
+	for (offset_slot = 0; offset_slot < ARRAY_LEN(deflate_offset_slot_base);
+	     offset_slot++) {
+		offset = deflate_offset_slot_base[offset_slot];
+		offset_end = offset +
+			     (1 << deflate_extra_offset_bits[offset_slot]);
+		do {
+			c->p.n.offset_slot_full[offset] = offset_slot;
+		} while (++offset != offset_end);
+	}
+}
+
+#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
+
+/*
+ * Allocate a DEFLATE compressor for the given compression level (0-12).
+ * Returns NULL if the level is out of range or if memory allocation fails.
+ * The allocation is trimmed to just the per-level parser state required:
+ * 'p.f' for level 1, 'p.g' for levels 2-9, and 'p.n' for levels 10-12
+ * (the latter only when SUPPORT_NEAR_OPTIMAL_PARSING is enabled; otherwise
+ * levels 10-12 fall back to the level-9 configuration via the 'default'
+ * label placement below).
+ */
+LIBDEFLATEAPI struct libdeflate_compressor *
+libdeflate_alloc_compressor(int compression_level)
+{
+	struct libdeflate_compressor *c;
+	/* Start with only the level-independent part of the struct. */
+	size_t size = offsetof(struct libdeflate_compressor, p);
+
+	check_buildtime_parameters();
+
+	if (compression_level < 0 || compression_level > 12)
+		return NULL;
+
+#if SUPPORT_NEAR_OPTIMAL_PARSING
+	if (compression_level >= 10)
+		size += sizeof(c->p.n);
+	else
+#endif
+	{
+		if (compression_level >= 2)
+			size += sizeof(c->p.g);
+		else if (compression_level == 1)
+			size += sizeof(c->p.f);
+	}
+
+	/* Aligned allocation: the matchfinders require aligned memory. */
+	c = libdeflate_aligned_malloc(MATCHFINDER_MEM_ALIGNMENT, size);
+	if (!c)
+		return NULL;
+
+	c->compression_level = compression_level;
+
+	/*
+	 * The higher the compression level, the more we should bother trying to
+	 * compress very small inputs.
+	 */
+	c->max_passthrough_size = 55 - (compression_level * 4);
+
+	/*
+	 * Select the implementation function and tune the match-finding
+	 * parameters (search depth, nice match length, optimization passes)
+	 * for the requested level.
+	 */
+	switch (compression_level) {
+	case 0:
+		/* Level 0: always store uncompressed. */
+		c->max_passthrough_size = SIZE_MAX;
+		c->impl = NULL; /* not used */
+		break;
+	case 1:
+		c->impl = deflate_compress_fastest;
+		/* max_search_depth is unused. */
+		c->nice_match_length = 32;
+		break;
+	case 2:
+		c->impl = deflate_compress_greedy;
+		c->max_search_depth = 6;
+		c->nice_match_length = 10;
+		break;
+	case 3:
+		c->impl = deflate_compress_greedy;
+		c->max_search_depth = 12;
+		c->nice_match_length = 14;
+		break;
+	case 4:
+		c->impl = deflate_compress_greedy;
+		c->max_search_depth = 16;
+		c->nice_match_length = 30;
+		break;
+	case 5:
+		c->impl = deflate_compress_lazy;
+		c->max_search_depth = 16;
+		c->nice_match_length = 30;
+		break;
+	case 6:
+		c->impl = deflate_compress_lazy;
+		c->max_search_depth = 35;
+		c->nice_match_length = 65;
+		break;
+	case 7:
+		c->impl = deflate_compress_lazy;
+		c->max_search_depth = 100;
+		c->nice_match_length = 130;
+		break;
+	case 8:
+		c->impl = deflate_compress_lazy2;
+		c->max_search_depth = 300;
+		c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
+		break;
+	case 9:
+#if !SUPPORT_NEAR_OPTIMAL_PARSING
+	default:
+#endif
+		c->impl = deflate_compress_lazy2;
+		c->max_search_depth = 600;
+		c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
+		break;
+#if SUPPORT_NEAR_OPTIMAL_PARSING
+	case 10:
+		c->impl = deflate_compress_near_optimal;
+		c->max_search_depth = 35;
+		c->nice_match_length = 75;
+		c->p.n.num_optim_passes = 2;
+		deflate_init_offset_slot_full(c);
+		break;
+	case 11:
+		c->impl = deflate_compress_near_optimal;
+		c->max_search_depth = 70;
+		c->nice_match_length = 150;
+		c->p.n.num_optim_passes = 3;
+		deflate_init_offset_slot_full(c);
+		break;
+	case 12:
+	default:
+		c->impl = deflate_compress_near_optimal;
+		c->max_search_depth = 150;
+		c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
+		c->p.n.num_optim_passes = 4;
+		deflate_init_offset_slot_full(c);
+		break;
+#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
+	}
+
+	deflate_init_static_codes(c);
+
+	return c;
+}
+
+/*
+ * Compress 'in_nbytes' bytes from 'in' into 'out', returning the compressed
+ * size in bytes, or 0 if the compressed data did not fit (or would have come
+ * within OUTPUT_END_PADDING bytes of the end of the output buffer -- the two
+ * cases are deliberately not distinguished; see the comment below and
+ * libdeflate_deflate_compress_bound()).
+ */
+LIBDEFLATEAPI size_t
+libdeflate_deflate_compress(struct libdeflate_compressor *c,
+			    const void *in, size_t in_nbytes,
+			    void *out, size_t out_nbytes_avail)
+{
+	struct deflate_output_bitstream os;
+
+	/*
+	 * For extremely short inputs, or for compression level 0, just output
+	 * uncompressed blocks.
+	 */
+	if (unlikely(in_nbytes <= c->max_passthrough_size))
+		return deflate_compress_none(in, in_nbytes,
+					     out, out_nbytes_avail);
+
+	/*
+	 * Initialize the output bitstream structure.
+	 *
+	 * The end is set to OUTPUT_END_PADDING below the true end, so that
+	 * FLUSH_BITS() can be more efficient.
+	 */
+	if (unlikely(out_nbytes_avail <= OUTPUT_END_PADDING))
+		return 0;
+	os.bitbuf = 0;
+	os.bitcount = 0;
+	os.next = out;
+	os.end = os.next + out_nbytes_avail - OUTPUT_END_PADDING;
+	/* Run the level-specific implementation chosen at allocation time. */
+	(*c->impl)(c, in, in_nbytes, &os);
+	/*
+	 * If 'os.next' reached 'os.end', then either there was not enough space
+	 * in the output buffer, or the compressed size would have been within
+	 * OUTPUT_END_PADDING of the true end. For performance reasons we don't
+	 * distinguish between these cases; we just make sure to return some
+	 * extra space from libdeflate_deflate_compress_bound().
+	 */
+	if (os.next >= os.end)
+		return 0;
+	ASSERT(os.bitcount <= 7);
+	/* Flush any remaining partial byte of the bitbuffer. */
+	if (os.bitcount)
+		*os.next++ = os.bitbuf;
+	return os.next - (u8 *)out;
+}
+
+/*
+ * Free a compressor allocated with libdeflate_alloc_compressor().
+ * Uses the aligned-free counterpart of libdeflate_aligned_malloc();
+ * safe to call with NULL (free of NULL is a no-op).
+ */
+LIBDEFLATEAPI void
+libdeflate_free_compressor(struct libdeflate_compressor *c)
+{
+	libdeflate_aligned_free(c);
+}
+
+/*
+ * Return the compression level this compressor was allocated with.
+ * Intentionally not LIBDEFLATEAPI: per deflate_compress.h, this is an
+ * internal query used for zlib and gzip header generation.
+ */
+unsigned int
+libdeflate_get_compression_level(struct libdeflate_compressor *c)
+{
+	return c->compression_level;
+}
+
+/*
+ * Return a worst-case upper bound on the compressed size of any input of
+ * 'in_nbytes' bytes, derived from the cost of storing the data as a series
+ * of uncompressed DEFLATE blocks (the detailed argument is in the comments
+ * below).  'c' is unused here but kept for API symmetry with
+ * libdeflate_deflate_compress().
+ */
+LIBDEFLATEAPI size_t
+libdeflate_deflate_compress_bound(struct libdeflate_compressor *c,
+				  size_t in_nbytes)
+{
+	size_t bound = 0;
+	size_t max_blocks;
+
+	/*
+	 * Since the compressor never uses a compressed block when an
+	 * uncompressed block is cheaper, the worst case can be no worse than
+	 * the case where only uncompressed blocks are used.
+	 *
+	 * This is true even though up to 7 bits are "wasted" to byte-align the
+	 * bitstream when a compressed block is followed by an uncompressed
+	 * block. This is because a compressed block wouldn't have been used if
+	 * it wasn't cheaper than an uncompressed block, and uncompressed blocks
+	 * always end on a byte boundary. So the alignment bits will, at worst,
+	 * go up to the place where the uncompressed block would have ended.
+	 */
+
+	/*
+	 * The minimum length that is passed to deflate_flush_block() is
+	 * MIN_BLOCK_LENGTH bytes, except for the final block if needed.
+	 *
+	 * If deflate_flush_block() decides to use an uncompressed block, it
+	 * actually will (in general) output a series of uncompressed blocks in
+	 * order to stay within the UINT16_MAX limit of DEFLATE. But this can
+	 * be disregarded here as long as '2 * MIN_BLOCK_LENGTH <= UINT16_MAX',
+	 * as in that case this behavior can't result in more blocks than the
+	 * case where deflate_flush_block() is called with min-length inputs.
+	 *
+	 * So the number of uncompressed blocks needed would be bounded by
+	 * DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH). However, empty inputs
+	 * need 1 (empty) block, which gives the final expression below.
+	 */
+	STATIC_ASSERT(2 * MIN_BLOCK_LENGTH <= UINT16_MAX);
+	max_blocks = MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1);
+
+	/*
+	 * Each uncompressed block has 5 bytes of overhead, for the BFINAL,
+	 * BTYPE, LEN, and NLEN fields. (For the reason explained earlier, the
+	 * alignment bits at the very start of the block can be disregarded;
+	 * they would otherwise increase the overhead to 6 bytes per block.)
+	 */
+	bound += 5 * max_blocks;
+
+	/* Account for the data itself, stored uncompressed. */
+	bound += in_nbytes;
+
+	/*
+	 * Add 1 + OUTPUT_END_PADDING because for performance reasons, the
+	 * compressor doesn't distinguish between cases where there wasn't
+	 * enough space and cases where the compressed size would have been
+	 * 'out_nbytes_avail - OUTPUT_END_PADDING' or greater. Adding
+	 * 1 + OUTPUT_END_PADDING to the bound ensures the needed wiggle room.
+	 */
+	bound += 1 + OUTPUT_END_PADDING;
+
+	return bound;
+}
diff --git a/tools/z64compress/src/enc/libdeflate/lib/deflate_compress.h b/tools/z64compress/src/enc/libdeflate/lib/deflate_compress.h
new file mode 100644
index 000000000..9451d548b
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/lib/deflate_compress.h
@@ -0,0 +1,15 @@
+#ifndef LIB_DEFLATE_COMPRESS_H
+#define LIB_DEFLATE_COMPRESS_H
+
+#include "lib_common.h"
+
+/*
+ * DEFLATE compression is private to deflate_compress.c, but we do need to be
+ * able to query the compression level for zlib and gzip header generation.
+ */
+
+struct libdeflate_compressor;
+
+unsigned int libdeflate_get_compression_level(struct libdeflate_compressor *c);
+
+#endif /* LIB_DEFLATE_COMPRESS_H */
diff --git a/tools/z64compress/src/enc/libdeflate/lib/deflate_constants.h b/tools/z64compress/src/enc/libdeflate/lib/deflate_constants.h
new file mode 100644
index 000000000..95c9e0a50
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/lib/deflate_constants.h
@@ -0,0 +1,56 @@
+/*
+ * deflate_constants.h - constants for the DEFLATE compression format
+ */
+
+#ifndef LIB_DEFLATE_CONSTANTS_H
+#define LIB_DEFLATE_CONSTANTS_H
+
+/* Valid block types */
+#define DEFLATE_BLOCKTYPE_UNCOMPRESSED 0
+#define DEFLATE_BLOCKTYPE_STATIC_HUFFMAN 1
+#define DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN 2
+
+/* Minimum and maximum supported match lengths (in bytes) */
+#define DEFLATE_MIN_MATCH_LEN 3
+#define DEFLATE_MAX_MATCH_LEN 258
+
+/* Maximum supported match offset (in bytes) */
+#define DEFLATE_MAX_MATCH_OFFSET 32768
+
+/* log2 of DEFLATE_MAX_MATCH_OFFSET */
+#define DEFLATE_WINDOW_ORDER 15
+
+/* Number of symbols in each Huffman code. Note: for the literal/length
+ * and offset codes, these are actually the maximum values; a given block
+ * might use fewer symbols. */
+#define DEFLATE_NUM_PRECODE_SYMS 19
+#define DEFLATE_NUM_LITLEN_SYMS 288
+#define DEFLATE_NUM_OFFSET_SYMS 32
+
+/* The maximum number of symbols across all codes */
+#define DEFLATE_MAX_NUM_SYMS 288
+
+/* Division of symbols in the literal/length code */
+#define DEFLATE_NUM_LITERALS 256
+#define DEFLATE_END_OF_BLOCK 256
+#define DEFLATE_FIRST_LEN_SYM 257
+
+/* Maximum codeword length, in bits, within each Huffman code */
+#define DEFLATE_MAX_PRE_CODEWORD_LEN 7
+#define DEFLATE_MAX_LITLEN_CODEWORD_LEN 15
+#define DEFLATE_MAX_OFFSET_CODEWORD_LEN 15
+
+/* The maximum codeword length across all codes */
+#define DEFLATE_MAX_CODEWORD_LEN 15
+
+/* Maximum possible overrun when decoding codeword lengths */
+#define DEFLATE_MAX_LENS_OVERRUN 137
+
+/*
+ * Maximum number of extra bits that may be required to represent a match
+ * length or offset.
+ */
+#define DEFLATE_MAX_EXTRA_LENGTH_BITS 5
+#define DEFLATE_MAX_EXTRA_OFFSET_BITS 13
+
+#endif /* LIB_DEFLATE_CONSTANTS_H */
diff --git a/tools/z64compress/src/enc/libdeflate/lib/deflate_decompress.c b/tools/z64compress/src/enc/libdeflate/lib/deflate_decompress.c
new file mode 100644
index 000000000..7d22fc443
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/lib/deflate_decompress.c
@@ -0,0 +1,1176 @@
+/*
+ * deflate_decompress.c - a decompressor for DEFLATE
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ---------------------------------------------------------------------------
+ *
+ * This is a highly optimized DEFLATE decompressor. It is much faster than
+ * vanilla zlib, typically well over twice as fast, though results vary by CPU.
+ *
+ * Why this is faster than vanilla zlib:
+ *
+ * - Word accesses rather than byte accesses when reading input
+ * - Word accesses rather than byte accesses when copying matches
+ * - Faster Huffman decoding combined with various DEFLATE-specific tricks
+ * - Larger bitbuffer variable that doesn't need to be refilled as often
+ * - Other optimizations to remove unnecessary branches
+ * - Only full-buffer decompression is supported, so the code doesn't need to
+ * support stopping and resuming decompression.
+ * - On x86_64, a version of the decompression routine is compiled with BMI2
+ * instructions enabled and is used automatically at runtime when supported.
+ */
+
+#include <limits.h>
+
+#include "lib_common.h"
+#include "deflate_constants.h"
+
+#include "libdeflate.h"
+
+/*
+ * If the expression passed to SAFETY_CHECK() evaluates to false, then the
+ * decompression routine immediately returns LIBDEFLATE_BAD_DATA, indicating the
+ * compressed data is invalid.
+ *
+ * Theoretically, these checks could be disabled for specialized applications
+ * where all input to the decompressor will be trusted.
+ */
+#if 0
+# pragma message("UNSAFE DECOMPRESSION IS ENABLED. THIS MUST ONLY BE USED IF THE DECOMPRESSOR INPUT WILL ALWAYS BE TRUSTED!")
+# define SAFETY_CHECK(expr) (void)(expr)
+#else
+# define SAFETY_CHECK(expr) if (unlikely(!(expr))) return LIBDEFLATE_BAD_DATA
+#endif
+
+/*****************************************************************************
+ * Input bitstream *
+ *****************************************************************************/
+
+/*
+ * The state of the "input bitstream" consists of the following variables:
+ *
+ * - in_next: a pointer to the next unread byte in the input buffer
+ *
+ * - in_end: a pointer to just past the end of the input buffer
+ *
+ * - bitbuf: a word-sized variable containing bits that have been read from
+ * the input buffer or from the implicit appended zero bytes
+ *
+ * - bitsleft: the number of bits in 'bitbuf' available to be consumed.
+ * After REFILL_BITS_BRANCHLESS(), 'bitbuf' can actually
+ * contain more bits than this. However, only the bits counted
+ * by 'bitsleft' can actually be consumed; the rest can only be
+ * used for preloading.
+ *
+ * As a micro-optimization, we allow bits 8 and higher of
+ * 'bitsleft' to contain garbage. When consuming the bits
+ * associated with a decode table entry, this allows us to do
+ * 'bitsleft -= entry' instead of 'bitsleft -= (u8)entry'.
+ * On some CPUs, this helps reduce instruction dependencies.
+ * This does have the disadvantage that 'bitsleft' sometimes
+ * needs to be cast to 'u8', such as when it's used as a shift
+ * amount in REFILL_BITS_BRANCHLESS(). But that one happens
+ * for free since most CPUs ignore high bits in shift amounts.
+ *
+ * - overread_count: the total number of implicit appended zero bytes that
+ * have been loaded into the bitbuffer, including any
+ * counted by 'bitsleft' and any already consumed
+ */
+
+/*
+ * The type for the bitbuffer variable ('bitbuf' described above). For best
+ * performance, this should have size equal to a machine word.
+ *
+ * 64-bit platforms have a significant advantage: they get a bigger bitbuffer
+ * which they don't have to refill as often.
+ */
+typedef machine_word_t bitbuf_t;
+#define BITBUF_NBITS (8 * (int)sizeof(bitbuf_t))
+
+/* BITMASK(n) returns a bitmask of length 'n'. */
+#define BITMASK(n) (((bitbuf_t)1 << (n)) - 1)
+
+/*
+ * MAX_BITSLEFT is the maximum number of consumable bits, i.e. the maximum value
+ * of '(u8)bitsleft'. This is the size of the bitbuffer variable, minus 1 if
+ * the branchless refill method is being used (see REFILL_BITS_BRANCHLESS()).
+ */
+#define MAX_BITSLEFT \
+ (UNALIGNED_ACCESS_IS_FAST ? BITBUF_NBITS - 1 : BITBUF_NBITS)
+
+/*
+ * CONSUMABLE_NBITS is the minimum number of bits that are guaranteed to be
+ * consumable (counted in 'bitsleft') immediately after refilling the bitbuffer.
+ * Since only whole bytes can be added to 'bitsleft', the worst case is
+ * 'MAX_BITSLEFT - 7': the smallest amount where another byte doesn't fit.
+ */
+#define CONSUMABLE_NBITS (MAX_BITSLEFT - 7)
+
+/*
+ * FASTLOOP_PRELOADABLE_NBITS is the minimum number of bits that are guaranteed
+ * to be preloadable immediately after REFILL_BITS_IN_FASTLOOP(). (It is *not*
+ * guaranteed after REFILL_BITS(), since REFILL_BITS() falls back to a
+ * byte-at-a-time refill method near the end of input.) This may exceed the
+ * number of consumable bits (counted by 'bitsleft'). Any bits not counted in
+ * 'bitsleft' can only be used for precomputation and cannot be consumed.
+ */
+#define FASTLOOP_PRELOADABLE_NBITS \
+ (UNALIGNED_ACCESS_IS_FAST ? BITBUF_NBITS : CONSUMABLE_NBITS)
+
+/*
+ * PRELOAD_SLACK is the minimum number of bits that are guaranteed to be
+ * preloadable but not consumable, following REFILL_BITS_IN_FASTLOOP() and any
+ * subsequent consumptions. This is 1 bit if the branchless refill method is
+ * being used, and 0 bits otherwise.
+ */
+#define PRELOAD_SLACK MAX(0, FASTLOOP_PRELOADABLE_NBITS - MAX_BITSLEFT)
+
+/*
+ * CAN_CONSUME(n) is true if it's guaranteed that if the bitbuffer has just been
+ * refilled, then it's always possible to consume 'n' bits from it. 'n' should
+ * be a compile-time constant, to enable compile-time evaluation.
+ */
+#define CAN_CONSUME(n) (CONSUMABLE_NBITS >= (n))
+
+/*
+ * CAN_CONSUME_AND_THEN_PRELOAD(consume_nbits, preload_nbits) is true if it's
+ * guaranteed that after REFILL_BITS_IN_FASTLOOP(), it's always possible to
+ * consume 'consume_nbits' bits, then preload 'preload_nbits' bits. The
+ * arguments should be compile-time constants to enable compile-time evaluation.
+ */
+#define CAN_CONSUME_AND_THEN_PRELOAD(consume_nbits, preload_nbits) \
+ (CONSUMABLE_NBITS >= (consume_nbits) && \
+ FASTLOOP_PRELOADABLE_NBITS >= (consume_nbits) + (preload_nbits))
+
+/*
+ * REFILL_BITS_BRANCHLESS() branchlessly refills the bitbuffer variable by
+ * reading the next word from the input buffer and updating 'in_next' and
+ * 'bitsleft' based on how many bits were refilled -- counting whole bytes only.
+ * This is much faster than reading a byte at a time, at least if the CPU is
+ * little endian and supports fast unaligned memory accesses.
+ *
+ * The simplest way of branchlessly updating 'bitsleft' would be:
+ *
+ * bitsleft += (MAX_BITSLEFT - bitsleft) & ~7;
+ *
+ * To make it faster, we define MAX_BITSLEFT to be 'WORDBITS - 1' rather than
+ * WORDBITS, so that in binary it looks like 111111 or 11111. Then, we update
+ * 'bitsleft' by just setting the bits above the low 3 bits:
+ *
+ * bitsleft |= MAX_BITSLEFT & ~7;
+ *
+ * That compiles down to a single instruction like 'or $0x38, %rbp'. Using
+ * 'MAX_BITSLEFT == WORDBITS - 1' also has the advantage that refills can be
+ * done when 'bitsleft == MAX_BITSLEFT' without invoking undefined behavior.
+ *
+ * The simplest way of branchlessly updating 'in_next' would be:
+ *
+ * in_next += (MAX_BITSLEFT - bitsleft) >> 3;
+ *
+ * With 'MAX_BITSLEFT == WORDBITS - 1' we could use an XOR instead, though this
+ * isn't really better:
+ *
+ * in_next += (MAX_BITSLEFT ^ bitsleft) >> 3;
+ *
+ * An alternative which can be marginally better is the following:
+ *
+ * in_next += sizeof(bitbuf_t) - 1;
+ * in_next -= (bitsleft >> 3) & 0x7;
+ *
+ * It seems this would increase the number of CPU instructions from 3 (sub, shr,
+ * add) to 4 (add, shr, and, sub). However, if the CPU has a bitfield
+ * extraction instruction (e.g. arm's ubfx), it stays at 3, and is potentially
+ * more efficient because the length of the longest dependency chain decreases
+ * from 3 to 2. This alternative also has the advantage that it ignores the
+ * high bits in 'bitsleft', so it is compatible with the micro-optimization we
+ * use where we let the high bits of 'bitsleft' contain garbage.
+ */
+#define REFILL_BITS_BRANCHLESS() \
+do { \
+ bitbuf |= get_unaligned_leword(in_next) << (u8)bitsleft; \
+ in_next += sizeof(bitbuf_t) - 1; \
+ in_next -= (bitsleft >> 3) & 0x7; \
+ bitsleft |= MAX_BITSLEFT & ~7; \
+} while (0)
+
+/*
+ * REFILL_BITS() loads bits from the input buffer until the bitbuffer variable
+ * contains at least CONSUMABLE_NBITS consumable bits.
+ *
+ * This checks for the end of input, and it doesn't guarantee
+ * FASTLOOP_PRELOADABLE_NBITS, so it can't be used in the fastloop.
+ *
+ * If we would overread the input buffer, we just don't read anything, leaving
+ * the bits zeroed but marking them filled. This simplifies the decompressor
+ * because it removes the need to always be able to distinguish between real
+ * overreads and overreads caused only by the decompressor's own lookahead.
+ *
+ * We do still keep track of the number of bytes that have been overread, for
+ * two reasons. First, it allows us to determine the exact number of bytes that
+ * were consumed once the stream ends or an uncompressed block is reached.
+ * Second, it allows us to stop early if the overread amount gets so large (more
+ * than sizeof bitbuf) that it can only be caused by a real overread. (The
+ * second part is arguably unneeded, since libdeflate is buffer-based; given
+ * infinite zeroes, it will eventually either completely fill the output buffer
+ * or return an error. However, we do it to be slightly more friendly to the
+ * not-recommended use case of decompressing with an unknown output size.)
+ */
+#define REFILL_BITS() \
+do { \
+ if (UNALIGNED_ACCESS_IS_FAST && \
+ likely(in_end - in_next >= sizeof(bitbuf_t))) { \
+ REFILL_BITS_BRANCHLESS(); \
+ } else { \
+ while ((u8)bitsleft < CONSUMABLE_NBITS) { \
+ if (likely(in_next != in_end)) { \
+ bitbuf |= (bitbuf_t)*in_next++ << \
+ (u8)bitsleft; \
+ } else { \
+ overread_count++; \
+ SAFETY_CHECK(overread_count <= \
+ sizeof(bitbuf_t)); \
+ } \
+ bitsleft += 8; \
+ } \
+ } \
+} while (0)
+
+/*
+ * REFILL_BITS_IN_FASTLOOP() is like REFILL_BITS(), but it doesn't check for the
+ * end of the input. It can only be used in the fastloop.
+ */
+#define REFILL_BITS_IN_FASTLOOP() \
+do { \
+ STATIC_ASSERT(UNALIGNED_ACCESS_IS_FAST || \
+ FASTLOOP_PRELOADABLE_NBITS == CONSUMABLE_NBITS); \
+ if (UNALIGNED_ACCESS_IS_FAST) { \
+ REFILL_BITS_BRANCHLESS(); \
+ } else { \
+ while ((u8)bitsleft < CONSUMABLE_NBITS) { \
+ bitbuf |= (bitbuf_t)*in_next++ << (u8)bitsleft; \
+ bitsleft += 8; \
+ } \
+ } \
+} while (0)
+
+/*
+ * This is the worst-case maximum number of output bytes that are written to
+ * during each iteration of the fastloop. The worst case is 2 literals, then a
+ * match of length DEFLATE_MAX_MATCH_LEN. Additionally, some slack space must
+ * be included for the intentional overrun in the match copy implementation.
+ */
+#define FASTLOOP_MAX_BYTES_WRITTEN \
+ (2 + DEFLATE_MAX_MATCH_LEN + (5 * WORDBYTES) - 1)
+
+/*
+ * This is the worst-case maximum number of input bytes that are read during
+ * each iteration of the fastloop. To get this value, we first compute the
+ * greatest number of bits that can be refilled during a loop iteration. The
+ * refill at the beginning can add at most MAX_BITSLEFT, and the amount that can
+ * be refilled later is no more than the maximum amount that can be consumed by
+ * 2 literals that don't need a subtable, then a match. We convert this value
+ * to bytes, rounding up; this gives the maximum number of bytes that 'in_next'
+ * can be advanced. Finally, we add sizeof(bitbuf_t) to account for
+ * REFILL_BITS_BRANCHLESS() reading a word past 'in_next'.
+ */
+#define FASTLOOP_MAX_BYTES_READ \
+ (DIV_ROUND_UP(MAX_BITSLEFT + (2 * LITLEN_TABLEBITS) + \
+ LENGTH_MAXBITS + OFFSET_MAXBITS, 8) + \
+ sizeof(bitbuf_t))
+
+/*****************************************************************************
+ * Huffman decoding *
+ *****************************************************************************/
+
+/*
+ * The fastest way to decode Huffman-encoded data is basically to use a decode
+ * table that maps the next TABLEBITS bits of data to their symbol. Each entry
+ * decode_table[i] maps to the symbol whose codeword is a prefix of 'i'. A
+ * symbol with codeword length 'n' has '2**(TABLEBITS-n)' entries in the table.
+ *
+ * Ideally, TABLEBITS and the maximum codeword length would be the same; some
+ * compression formats are designed with this goal in mind. Unfortunately, in
+ * DEFLATE, the maximum litlen and offset codeword lengths are 15 bits, which is
+ * too large for a practical TABLEBITS. It's not *that* much larger, though, so
+ * the workaround is to use a single level of subtables. In the main table,
+ * entries for prefixes of codewords longer than TABLEBITS contain a "pointer"
+ * to the appropriate subtable along with the number of bits it is indexed with.
+ *
+ * The most efficient way to allocate subtables is to allocate them dynamically
+ * after the main table. The worst-case number of table entries needed,
+ * including subtables, is precomputable; see the ENOUGH constants below.
+ *
+ * A useful optimization is to store the codeword lengths in the decode table so
+ * that they don't have to be looked up by indexing a separate table that maps
+ * symbols to their codeword lengths. We basically do this; however, for the
+ * litlen and offset codes we also implement some DEFLATE-specific optimizations
+ * that build in the consideration of the "extra bits" and the
+ * literal/length/end-of-block division. For the exact decode table entry
+ * format we use, see the definitions of the *_decode_results[] arrays below.
+ */
+
+
+/*
+ * These are the TABLEBITS values we use for each of the DEFLATE Huffman codes,
+ * along with their corresponding ENOUGH values.
+ *
+ * For the precode, we use PRECODE_TABLEBITS == 7 since this is the maximum
+ * precode codeword length. This avoids ever needing subtables.
+ *
+ * For the litlen and offset codes, we cannot realistically avoid ever needing
+ * subtables, since litlen and offset codewords can be up to 15 bits. A higher
+ * TABLEBITS reduces the number of lookups that need a subtable, which increases
+ * performance; however, it increases memory usage and makes building the table
+ * take longer, which decreases performance. We choose values that work well in
+ * practice, making subtables rarely needed without making the tables too large.
+ *
+ * Our choice of OFFSET_TABLEBITS == 8 is a bit low; without any special
+ * considerations, 9 would fit the trade-off curve better. However, there is a
+ * performance benefit to using exactly 8 bits when it is a compile-time
+ * constant, as many CPUs can take the low byte more easily than the low 9 bits.
+ *
+ * zlib treats its equivalents of TABLEBITS as maximum values; whenever it
+ * builds a table, it caps the actual table_bits to the longest codeword. This
+ * makes sense in theory, as there's no need for the table to be any larger than
+ * needed to support the longest codeword. However, having the table bits be a
+ * compile-time constant is beneficial to the performance of the decode loop, so
+ * there is a trade-off. libdeflate currently uses the dynamic table_bits
+ * strategy for the litlen table only, due to its larger maximum size.
+ * PRECODE_TABLEBITS and OFFSET_TABLEBITS are smaller, so going dynamic there
+ * isn't as useful, and OFFSET_TABLEBITS=8 is useful as mentioned above.
+ *
+ * Each TABLEBITS value has a corresponding ENOUGH value that gives the
+ * worst-case maximum number of decode table entries, including the main table
+ * and all subtables. The ENOUGH value depends on three parameters:
+ *
+ * (1) the maximum number of symbols in the code (DEFLATE_NUM_*_SYMS)
+ * (2) the maximum number of main table bits (*_TABLEBITS)
+ * (3) the maximum allowed codeword length (DEFLATE_MAX_*_CODEWORD_LEN)
+ *
+ * The ENOUGH values were computed using the utility program 'enough' from zlib.
+ */
+#define PRECODE_TABLEBITS 7
+#define PRECODE_ENOUGH 128 /* enough 19 7 7 */
+#define LITLEN_TABLEBITS 11
+#define LITLEN_ENOUGH 2342 /* enough 288 11 15 */
+#define OFFSET_TABLEBITS 8
+#define OFFSET_ENOUGH 402 /* enough 32 8 15 */
+
+/*
+ * make_decode_table_entry() creates a decode table entry for the given symbol
+ * by combining the static part 'decode_results[sym]' with the dynamic part
+ * 'len', which is the remaining codeword length (the codeword length for main
+ * table entries, or the codeword length minus TABLEBITS for subtable entries).
+ *
+ * In all cases, we add 'len' to each of the two low-order bytes to create the
+ * appropriately-formatted decode table entry. See the definitions of the
+ * *_decode_results[] arrays below, where the entry format is described.
+ */
+static forceinline u32
+make_decode_table_entry(const u32 decode_results[], u32 sym, u32 len)
+{
+	/*
+	 * 'len' is added into both of the two low-order bytes: the low byte
+	 * accumulates the total number of bits to consume, and the second
+	 * byte holds the remaining codeword length.  See the entry format
+	 * descriptions accompanying the *_decode_results[] arrays below.
+	 */
+	return decode_results[sym] + (len << 8) + len;
+}
+
+/*
+ * Here is the format of our precode decode table entries. Bits not explicitly
+ * described contain zeroes:
+ *
+ * Bit 20-16: presym
+ * Bit 10-8: codeword length [not used]
+ * Bit 2-0: codeword length
+ *
+ * The precode decode table never has subtables, since we use
+ * PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN.
+ *
+ * precode_decode_results[] contains the static part of the entry for each
+ * symbol. make_decode_table_entry() produces the final entries.
+ */
+static const u32 precode_decode_results[] = {
+#define ENTRY(presym) ((u32)presym << 16)
+	/*
+	 * One entry per precode symbol (19 == DEFLATE_NUM_PRECODE_SYMS,
+	 * verified by a STATIC_ASSERT in build_precode_decode_table()).
+	 */
+	ENTRY(0) , ENTRY(1) , ENTRY(2) , ENTRY(3) ,
+	ENTRY(4) , ENTRY(5) , ENTRY(6) , ENTRY(7) ,
+	ENTRY(8) , ENTRY(9) , ENTRY(10) , ENTRY(11) ,
+	ENTRY(12) , ENTRY(13) , ENTRY(14) , ENTRY(15) ,
+	ENTRY(16) , ENTRY(17) , ENTRY(18) ,
+#undef ENTRY
+};
+
+/* Litlen and offset decode table entry flags */
+
+/* Indicates a literal entry in the litlen decode table */
+/* (deliberately the sign bit, which is cheap to test; see notes below) */
+#define HUFFDEC_LITERAL 0x80000000
+
+/* Indicates that HUFFDEC_SUBTABLE_POINTER or HUFFDEC_END_OF_BLOCK is set */
+#define HUFFDEC_EXCEPTIONAL 0x00008000
+
+/* Indicates a subtable pointer entry in the litlen or offset decode table */
+#define HUFFDEC_SUBTABLE_POINTER 0x00004000
+
+/* Indicates an end-of-block entry in the litlen decode table */
+#define HUFFDEC_END_OF_BLOCK 0x00002000
+
+/* Maximum number of bits that can be consumed by decoding a match length */
+#define LENGTH_MAXBITS (DEFLATE_MAX_LITLEN_CODEWORD_LEN + \
+			DEFLATE_MAX_EXTRA_LENGTH_BITS)
+/* Same, but for the common case where the main-table lookup suffices */
+#define LENGTH_MAXFASTBITS (LITLEN_TABLEBITS /* no subtable needed */ + \
+			    DEFLATE_MAX_EXTRA_LENGTH_BITS)
+
+/*
+ * Here is the format of our litlen decode table entries. Bits not explicitly
+ * described contain zeroes:
+ *
+ * Literals:
+ * Bit 31: 1 (HUFFDEC_LITERAL)
+ * Bit 23-16: literal value
+ * Bit 15: 0 (!HUFFDEC_EXCEPTIONAL)
+ * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER)
+ * Bit 13: 0 (!HUFFDEC_END_OF_BLOCK)
+ * Bit 11-8: remaining codeword length [not used]
+ * Bit 3-0: remaining codeword length
+ * Lengths:
+ * Bit 31: 0 (!HUFFDEC_LITERAL)
+ * Bit 24-16: length base value
+ * Bit 15: 0 (!HUFFDEC_EXCEPTIONAL)
+ * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER)
+ * Bit 13: 0 (!HUFFDEC_END_OF_BLOCK)
+ * Bit 11-8: remaining codeword length
+ * Bit 4-0: remaining codeword length + number of extra bits
+ * End of block:
+ * Bit 31: 0 (!HUFFDEC_LITERAL)
+ * Bit 15: 1 (HUFFDEC_EXCEPTIONAL)
+ * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER)
+ * Bit 13: 1 (HUFFDEC_END_OF_BLOCK)
+ * Bit 11-8: remaining codeword length [not used]
+ * Bit 3-0: remaining codeword length
+ * Subtable pointer:
+ * Bit 31: 0 (!HUFFDEC_LITERAL)
+ * Bit 30-16: index of start of subtable
+ * Bit 15: 1 (HUFFDEC_EXCEPTIONAL)
+ * Bit 14: 1 (HUFFDEC_SUBTABLE_POINTER)
+ * Bit 13: 0 (!HUFFDEC_END_OF_BLOCK)
+ * Bit 11-8: number of subtable bits
+ * Bit 3-0: number of main table bits
+ *
+ * This format has several desirable properties:
+ *
+ * - The codeword length, length slot base, and number of extra length bits
+ * are all built in. This eliminates the need to separately look up this
+ * information by indexing separate arrays by symbol or length slot.
+ *
+ * - The HUFFDEC_* flags enable easily distinguishing between the different
+ * types of entries. The HUFFDEC_LITERAL flag enables a fast path for
+ * literals; the high bit is used for this, as some CPUs can test the
+ * high bit more easily than other bits. The HUFFDEC_EXCEPTIONAL flag
+ * makes it possible to detect the two unlikely cases (subtable pointer
+ * and end of block) in a single bit flag test.
+ *
+ * - The low byte is the number of bits that need to be removed from the
+ * bitstream; this makes this value easily accessible, and it enables the
+ * micro-optimization of doing 'bitsleft -= entry' instead of
+ * 'bitsleft -= (u8)entry'. It also includes the number of extra bits,
+ * so they don't need to be removed separately.
+ *
+ * - The flags in bits 15-13 are arranged to be 0 when the
+ * "remaining codeword length" in bits 11-8 is needed, making this value
+ * fairly easily accessible as well via a shift and downcast.
+ *
+ * - Similarly, bits 13-12 are 0 when the "subtable bits" in bits 11-8 are
+ * needed, making it possible to extract this value with '& 0x3F' rather
+ * than '& 0xF'. This value is only used as a shift amount, so this can
+ * save an 'and' instruction as the masking by 0x3F happens implicitly.
+ *
+ * litlen_decode_results[] contains the static part of the entry for each
+ * symbol. make_decode_table_entry() produces the final entries.
+ */
+static const u32 litlen_decode_results[] = {
+
+	/* Literals */
+#define ENTRY(literal) (HUFFDEC_LITERAL | ((u32)literal << 16))
+	ENTRY(0) , ENTRY(1) , ENTRY(2) , ENTRY(3) ,
+	ENTRY(4) , ENTRY(5) , ENTRY(6) , ENTRY(7) ,
+	ENTRY(8) , ENTRY(9) , ENTRY(10) , ENTRY(11) ,
+	ENTRY(12) , ENTRY(13) , ENTRY(14) , ENTRY(15) ,
+	ENTRY(16) , ENTRY(17) , ENTRY(18) , ENTRY(19) ,
+	ENTRY(20) , ENTRY(21) , ENTRY(22) , ENTRY(23) ,
+	ENTRY(24) , ENTRY(25) , ENTRY(26) , ENTRY(27) ,
+	ENTRY(28) , ENTRY(29) , ENTRY(30) , ENTRY(31) ,
+	ENTRY(32) , ENTRY(33) , ENTRY(34) , ENTRY(35) ,
+	ENTRY(36) , ENTRY(37) , ENTRY(38) , ENTRY(39) ,
+	ENTRY(40) , ENTRY(41) , ENTRY(42) , ENTRY(43) ,
+	ENTRY(44) , ENTRY(45) , ENTRY(46) , ENTRY(47) ,
+	ENTRY(48) , ENTRY(49) , ENTRY(50) , ENTRY(51) ,
+	ENTRY(52) , ENTRY(53) , ENTRY(54) , ENTRY(55) ,
+	ENTRY(56) , ENTRY(57) , ENTRY(58) , ENTRY(59) ,
+	ENTRY(60) , ENTRY(61) , ENTRY(62) , ENTRY(63) ,
+	ENTRY(64) , ENTRY(65) , ENTRY(66) , ENTRY(67) ,
+	ENTRY(68) , ENTRY(69) , ENTRY(70) , ENTRY(71) ,
+	ENTRY(72) , ENTRY(73) , ENTRY(74) , ENTRY(75) ,
+	ENTRY(76) , ENTRY(77) , ENTRY(78) , ENTRY(79) ,
+	ENTRY(80) , ENTRY(81) , ENTRY(82) , ENTRY(83) ,
+	ENTRY(84) , ENTRY(85) , ENTRY(86) , ENTRY(87) ,
+	ENTRY(88) , ENTRY(89) , ENTRY(90) , ENTRY(91) ,
+	ENTRY(92) , ENTRY(93) , ENTRY(94) , ENTRY(95) ,
+	ENTRY(96) , ENTRY(97) , ENTRY(98) , ENTRY(99) ,
+	ENTRY(100) , ENTRY(101) , ENTRY(102) , ENTRY(103) ,
+	ENTRY(104) , ENTRY(105) , ENTRY(106) , ENTRY(107) ,
+	ENTRY(108) , ENTRY(109) , ENTRY(110) , ENTRY(111) ,
+	ENTRY(112) , ENTRY(113) , ENTRY(114) , ENTRY(115) ,
+	ENTRY(116) , ENTRY(117) , ENTRY(118) , ENTRY(119) ,
+	ENTRY(120) , ENTRY(121) , ENTRY(122) , ENTRY(123) ,
+	ENTRY(124) , ENTRY(125) , ENTRY(126) , ENTRY(127) ,
+	ENTRY(128) , ENTRY(129) , ENTRY(130) , ENTRY(131) ,
+	ENTRY(132) , ENTRY(133) , ENTRY(134) , ENTRY(135) ,
+	ENTRY(136) , ENTRY(137) , ENTRY(138) , ENTRY(139) ,
+	ENTRY(140) , ENTRY(141) , ENTRY(142) , ENTRY(143) ,
+	ENTRY(144) , ENTRY(145) , ENTRY(146) , ENTRY(147) ,
+	ENTRY(148) , ENTRY(149) , ENTRY(150) , ENTRY(151) ,
+	ENTRY(152) , ENTRY(153) , ENTRY(154) , ENTRY(155) ,
+	ENTRY(156) , ENTRY(157) , ENTRY(158) , ENTRY(159) ,
+	ENTRY(160) , ENTRY(161) , ENTRY(162) , ENTRY(163) ,
+	ENTRY(164) , ENTRY(165) , ENTRY(166) , ENTRY(167) ,
+	ENTRY(168) , ENTRY(169) , ENTRY(170) , ENTRY(171) ,
+	ENTRY(172) , ENTRY(173) , ENTRY(174) , ENTRY(175) ,
+	ENTRY(176) , ENTRY(177) , ENTRY(178) , ENTRY(179) ,
+	ENTRY(180) , ENTRY(181) , ENTRY(182) , ENTRY(183) ,
+	ENTRY(184) , ENTRY(185) , ENTRY(186) , ENTRY(187) ,
+	ENTRY(188) , ENTRY(189) , ENTRY(190) , ENTRY(191) ,
+	ENTRY(192) , ENTRY(193) , ENTRY(194) , ENTRY(195) ,
+	ENTRY(196) , ENTRY(197) , ENTRY(198) , ENTRY(199) ,
+	ENTRY(200) , ENTRY(201) , ENTRY(202) , ENTRY(203) ,
+	ENTRY(204) , ENTRY(205) , ENTRY(206) , ENTRY(207) ,
+	ENTRY(208) , ENTRY(209) , ENTRY(210) , ENTRY(211) ,
+	ENTRY(212) , ENTRY(213) , ENTRY(214) , ENTRY(215) ,
+	ENTRY(216) , ENTRY(217) , ENTRY(218) , ENTRY(219) ,
+	ENTRY(220) , ENTRY(221) , ENTRY(222) , ENTRY(223) ,
+	ENTRY(224) , ENTRY(225) , ENTRY(226) , ENTRY(227) ,
+	ENTRY(228) , ENTRY(229) , ENTRY(230) , ENTRY(231) ,
+	ENTRY(232) , ENTRY(233) , ENTRY(234) , ENTRY(235) ,
+	ENTRY(236) , ENTRY(237) , ENTRY(238) , ENTRY(239) ,
+	ENTRY(240) , ENTRY(241) , ENTRY(242) , ENTRY(243) ,
+	ENTRY(244) , ENTRY(245) , ENTRY(246) , ENTRY(247) ,
+	ENTRY(248) , ENTRY(249) , ENTRY(250) , ENTRY(251) ,
+	ENTRY(252) , ENTRY(253) , ENTRY(254) , ENTRY(255) ,
+#undef ENTRY
+
+	/* End of block */
+	HUFFDEC_EXCEPTIONAL | HUFFDEC_END_OF_BLOCK,
+
+	/* Lengths */
+#define ENTRY(length_base, num_extra_bits) \
+	(((u32)(length_base) << 16) | (num_extra_bits))
+	ENTRY(3  , 0) , ENTRY(4  , 0) , ENTRY(5  , 0) , ENTRY(6  , 0),
+	ENTRY(7  , 0) , ENTRY(8  , 0) , ENTRY(9  , 0) , ENTRY(10 , 0),
+	ENTRY(11 , 1) , ENTRY(13 , 1) , ENTRY(15 , 1) , ENTRY(17 , 1),
+	ENTRY(19 , 2) , ENTRY(23 , 2) , ENTRY(27 , 2) , ENTRY(31 , 2),
+	ENTRY(35 , 3) , ENTRY(43 , 3) , ENTRY(51 , 3) , ENTRY(59 , 3),
+	ENTRY(67 , 4) , ENTRY(83 , 4) , ENTRY(99 , 4) , ENTRY(115, 4),
+	ENTRY(131, 5) , ENTRY(163, 5) , ENTRY(195, 5) , ENTRY(227, 5),
+	/*
+	 * The last two entries are for the reserved litlen symbols 286 and
+	 * 287, which never occur in valid data (RFC 1951) but still need
+	 * entries so the array covers all 288 litlen symbols (verified by a
+	 * STATIC_ASSERT in build_litlen_decode_table()).
+	 */
+	ENTRY(258, 0) , ENTRY(258, 0) , ENTRY(258, 0) ,
+#undef ENTRY
+};
+
+/* Maximum number of bits that can be consumed by decoding a match offset */
+#define OFFSET_MAXBITS (DEFLATE_MAX_OFFSET_CODEWORD_LEN + \
+			DEFLATE_MAX_EXTRA_OFFSET_BITS)
+/* Same, but for the common case where the main-table lookup suffices
+   (the offset analog of LENGTH_MAXFASTBITS) */
+#define OFFSET_MAXFASTBITS (OFFSET_TABLEBITS /* no subtable needed */ + \
+			    DEFLATE_MAX_EXTRA_OFFSET_BITS)
+
+/*
+ * Here is the format of our offset decode table entries. Bits not explicitly
+ * described contain zeroes:
+ *
+ * Offsets:
+ * Bit 31-16: offset base value
+ * Bit 15: 0 (!HUFFDEC_EXCEPTIONAL)
+ * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER)
+ * Bit 11-8: remaining codeword length
+ * Bit 4-0: remaining codeword length + number of extra bits
+ * Subtable pointer:
+ * Bit 31-16: index of start of subtable
+ * Bit 15: 1 (HUFFDEC_EXCEPTIONAL)
+ * Bit 14: 1 (HUFFDEC_SUBTABLE_POINTER)
+ * Bit 11-8: number of subtable bits
+ * Bit 3-0: number of main table bits
+ *
+ * These work the same way as the length entries and subtable pointer entries in
+ * the litlen decode table; see litlen_decode_results[] above.
+ */
+static const u32 offset_decode_results[] = {
+#define ENTRY(offset_base, num_extra_bits) \
+	(((u32)(offset_base) << 16) | (num_extra_bits))
+	ENTRY(1     , 0)  , ENTRY(2     , 0)  , ENTRY(3     , 0)  , ENTRY(4     , 0)  ,
+	ENTRY(5     , 1)  , ENTRY(7     , 1)  , ENTRY(9     , 2)  , ENTRY(13    , 2)  ,
+	ENTRY(17    , 3)  , ENTRY(25    , 3)  , ENTRY(33    , 4)  , ENTRY(49    , 4)  ,
+	ENTRY(65    , 5)  , ENTRY(97    , 5)  , ENTRY(129   , 6)  , ENTRY(193   , 6)  ,
+	ENTRY(257   , 7)  , ENTRY(385   , 7)  , ENTRY(513   , 8)  , ENTRY(769   , 8)  ,
+	ENTRY(1025  , 9)  , ENTRY(1537  , 9)  , ENTRY(2049  , 10) , ENTRY(3073  , 10) ,
+	ENTRY(4097  , 11) , ENTRY(6145  , 11) , ENTRY(8193  , 12) , ENTRY(12289 , 12) ,
+	/*
+	 * The final two entries are for the reserved offset symbols 30 and
+	 * 31 (RFC 1951), giving the array one entry per offset symbol
+	 * (32 == DEFLATE_NUM_OFFSET_SYMS; see build_offset_decode_table()).
+	 */
+	ENTRY(16385 , 13) , ENTRY(24577 , 13) , ENTRY(24577 , 13) , ENTRY(24577 , 13) ,
+#undef ENTRY
+};
+
+/*
+ * The main DEFLATE decompressor structure. Since libdeflate only supports
+ * full-buffer decompression, this structure doesn't store the entire
+ * decompression state, most of which is in stack variables. Instead, this
+ * struct just contains the decode tables and some temporary arrays used for
+ * building them, as these are too large to comfortably allocate on the stack.
+ *
+ * Storing the decode tables in the decompressor struct also allows the decode
+ * tables for the static codes to be reused whenever two static Huffman blocks
+ * are decoded without an intervening dynamic block, even across streams.
+ */
+struct libdeflate_decompressor {
+
+	/*
+	 * The arrays aren't all needed at the same time. 'precode_lens' and
+	 * 'precode_decode_table' are unneeded after 'lens' has been filled.
+	 * Furthermore, 'lens' need not be retained after building the litlen
+	 * and offset decode tables. In fact, 'lens' can be in union with
+	 * 'litlen_decode_table' provided that 'offset_decode_table' is separate
+	 * and is built first.
+	 */
+
+	union {
+		u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS];
+
+		struct {
+			u8 lens[DEFLATE_NUM_LITLEN_SYMS +
+				DEFLATE_NUM_OFFSET_SYMS +
+				DEFLATE_MAX_LENS_OVERRUN];
+
+			u32 precode_decode_table[PRECODE_ENOUGH];
+		} l;
+
+		u32 litlen_decode_table[LITLEN_ENOUGH];
+	} u;
+
+	u32 offset_decode_table[OFFSET_ENOUGH];
+
+	/* used only during build_decode_table() */
+	u16 sorted_syms[DEFLATE_MAX_NUM_SYMS];
+
+	/* true once the static-code decode tables have been built and cached
+	   (must start out false; see libdeflate_alloc_decompressor()) */
+	bool static_codes_loaded;
+
+	/* the actual table_bits used for the litlen decode table; dynamic,
+	   set by build_decode_table() via its table_bits_ret parameter */
+	unsigned litlen_tablebits;
+};
+
+/*
+ * Build a table for fast decoding of symbols from a Huffman code. As input,
+ * this function takes the codeword length of each symbol which may be used in
+ * the code. As output, it produces a decode table for the canonical Huffman
+ * code described by the codeword lengths. The decode table is built with the
+ * assumption that it will be indexed with "bit-reversed" codewords, where the
+ * low-order bit is the first bit of the codeword. This format is used for all
+ * Huffman codes in DEFLATE.
+ *
+ * @decode_table
+ * The array in which the decode table will be generated. This array must
+ * have sufficient length; see the definition of the ENOUGH numbers.
+ * @lens
+ * An array which provides, for each symbol, the length of the
+ * corresponding codeword in bits, or 0 if the symbol is unused. This may
+ * alias @decode_table, since nothing is written to @decode_table until all
+ * @lens have been consumed. All codeword lengths are assumed to be <=
+ * @max_codeword_len but are otherwise considered untrusted. If they do
+ * not form a valid Huffman code, then the decode table is not built and
+ * %false is returned.
+ * @num_syms
+ * The number of symbols in the code, including all unused symbols.
+ * @decode_results
+ * An array which gives the incomplete decode result for each symbol. The
+ * needed values in this array will be combined with codeword lengths to
+ * make the final decode table entries using make_decode_table_entry().
+ * @table_bits
+ * The log base-2 of the number of main table entries to use.
+ * If @table_bits_ret != NULL, then @table_bits is treated as a maximum
+ * value and it will be decreased if a smaller table would be sufficient.
+ * @max_codeword_len
+ * The maximum allowed codeword length for this Huffman code.
+ * Must be <= DEFLATE_MAX_CODEWORD_LEN.
+ * @sorted_syms
+ * A temporary array of length @num_syms.
+ * @table_bits_ret
+ * If non-NULL, then the dynamic table_bits is enabled, and the actual
+ * table_bits value will be returned here.
+ *
+ * Returns %true if successful; %false if the codeword lengths do not form a
+ * valid Huffman code.
+ */
+static bool
+build_decode_table(u32 decode_table[],
+		   const u8 lens[],
+		   const unsigned num_syms,
+		   const u32 decode_results[],
+		   unsigned table_bits,
+		   unsigned max_codeword_len,
+		   u16 *sorted_syms,
+		   unsigned *table_bits_ret)
+{
+	unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1];
+	unsigned offsets[DEFLATE_MAX_CODEWORD_LEN + 1];
+	unsigned sym;		/* current symbol */
+	unsigned codeword;	/* current codeword, bit-reversed */
+	unsigned len;		/* current codeword length in bits */
+	unsigned count;		/* num codewords remaining with this length */
+	u32 codespace_used;	/* codespace used out of '2^max_codeword_len' */
+	unsigned cur_table_end; /* end index of current table */
+	unsigned subtable_prefix; /* codeword prefix of current subtable */
+	unsigned subtable_start;  /* start index of current subtable */
+	unsigned subtable_bits;   /* log2 of current subtable length */
+
+	/* Count how many codewords have each length, including 0. */
+	for (len = 0; len <= max_codeword_len; len++)
+		len_counts[len] = 0;
+	for (sym = 0; sym < num_syms; sym++)
+		len_counts[lens[sym]]++;
+
+	/*
+	 * Determine the actual maximum codeword length that was used, and
+	 * decrease table_bits to it if allowed.
+	 */
+	while (max_codeword_len > 1 && len_counts[max_codeword_len] == 0)
+		max_codeword_len--;
+	if (table_bits_ret != NULL) {
+		table_bits = MIN(table_bits, max_codeword_len);
+		*table_bits_ret = table_bits;
+	}
+
+	/*
+	 * Sort the symbols primarily by increasing codeword length and
+	 * secondarily by increasing symbol value; or equivalently by their
+	 * codewords in lexicographic order, since a canonical code is assumed.
+	 *
+	 * For efficiency, also compute 'codespace_used' in the same pass over
+	 * 'len_counts[]' used to build 'offsets[]' for sorting.
+	 */
+
+	/* Ensure that 'codespace_used' cannot overflow. */
+	STATIC_ASSERT(sizeof(codespace_used) == 4);
+	STATIC_ASSERT(UINT32_MAX / (1U << (DEFLATE_MAX_CODEWORD_LEN - 1)) >=
+		      DEFLATE_MAX_NUM_SYMS);
+
+	/*
+	 * Counting sort setup: offsets[len] becomes the start index in
+	 * sorted_syms[] for the symbols whose codeword length is 'len'.
+	 * Unused symbols (length 0) are placed first and skipped below.
+	 */
+	offsets[0] = 0;
+	offsets[1] = len_counts[0];
+	codespace_used = 0;
+	for (len = 1; len < max_codeword_len; len++) {
+		offsets[len + 1] = offsets[len] + len_counts[len];
+		codespace_used = (codespace_used << 1) + len_counts[len];
+	}
+	codespace_used = (codespace_used << 1) + len_counts[len];
+
+	for (sym = 0; sym < num_syms; sym++)
+		sorted_syms[offsets[lens[sym]]++] = sym;
+
+	sorted_syms += offsets[0];	/* Skip unused symbols */
+
+	/* lens[] is done being used, so we can write to decode_table[] now. */
+
+	/*
+	 * Check whether the lengths form a complete code (exactly fills the
+	 * codespace), an incomplete code (doesn't fill the codespace), or an
+	 * overfull code (overflows the codespace). A codeword of length 'n'
+	 * uses proportion '1/(2^n)' of the codespace. An overfull code is
+	 * nonsensical, so is considered invalid. An incomplete code is
+	 * considered valid only in two specific cases; see below.
+	 */
+
+	/* overfull code? */
+	if (unlikely(codespace_used > (1U << max_codeword_len)))
+		return false;
+
+	/* incomplete code? */
+	if (unlikely(codespace_used < (1U << max_codeword_len))) {
+		u32 entry;
+		unsigned i;
+
+		if (codespace_used == 0) {
+			/*
+			 * An empty code is allowed. This can happen for the
+			 * offset code in DEFLATE, since a dynamic Huffman block
+			 * need not contain any matches.
+			 */
+
+			/* sym=0, len=1 (arbitrary) */
+			entry = make_decode_table_entry(decode_results, 0, 1);
+		} else {
+			/*
+			 * Allow codes with a single used symbol, with codeword
+			 * length 1. The DEFLATE RFC is unclear regarding this
+			 * case. What zlib's decompressor does is permit this
+			 * for the litlen and offset codes and assume the
+			 * codeword is '0' rather than '1'. We do the same
+			 * except we allow this for precodes too, since there's
+			 * no convincing reason to treat the codes differently.
+			 * We also assign both codewords '0' and '1' to the
+			 * symbol to avoid having to handle '1' specially.
+			 */
+			if (codespace_used != (1U << (max_codeword_len - 1)) ||
+			    len_counts[1] != 1)
+				return false;
+			entry = make_decode_table_entry(decode_results,
+							*sorted_syms, 1);
+		}
+		/*
+		 * Note: the decode table still must be fully initialized, in
+		 * case the stream is malformed and contains bits from the part
+		 * of the codespace the incomplete code doesn't use.
+		 */
+		for (i = 0; i < (1U << table_bits); i++)
+			decode_table[i] = entry;
+		return true;
+	}
+
+	/*
+	 * The lengths form a complete code. Now, enumerate the codewords in
+	 * lexicographic order and fill the decode table entries for each one.
+	 *
+	 * First, process all codewords with len <= table_bits. Each one gets
+	 * '2^(table_bits-len)' direct entries in the table.
+	 *
+	 * Since DEFLATE uses bit-reversed codewords, these entries aren't
+	 * consecutive but rather are spaced '2^len' entries apart. This makes
+	 * filling them naively somewhat awkward and inefficient, since strided
+	 * stores are less cache-friendly and preclude the use of word or
+	 * vector-at-a-time stores to fill multiple entries per instruction.
+	 *
+	 * To optimize this, we incrementally double the table size. When
+	 * processing codewords with length 'len', the table is treated as
+	 * having only '2^len' entries, so each codeword uses just one entry.
+	 * Then, each time 'len' is incremented, the table size is doubled and
+	 * the first half is copied to the second half. This significantly
+	 * improves performance over naively doing strided stores.
+	 *
+	 * Note that some entries copied for each table doubling may not have
+	 * been initialized yet, but it doesn't matter since they're guaranteed
+	 * to be initialized later (because the Huffman code is complete).
+	 */
+	codeword = 0;
+	len = 1;
+	while ((count = len_counts[len]) == 0)
+		len++;
+	cur_table_end = 1U << len;
+	while (len <= table_bits) {
+		/* Process all 'count' codewords with length 'len' bits. */
+		do {
+			unsigned bit;
+
+			/* Fill the first entry for the current codeword. */
+			decode_table[codeword] =
+				make_decode_table_entry(decode_results,
+							*sorted_syms++, len);
+
+			if (codeword == cur_table_end - 1) {
+				/* Last codeword (all 1's) */
+				for (; len < table_bits; len++) {
+					memcpy(&decode_table[cur_table_end],
+					       decode_table,
+					       cur_table_end *
+						sizeof(decode_table[0]));
+					cur_table_end <<= 1;
+				}
+				return true;
+			}
+			/*
+			 * To advance to the lexicographically next codeword in
+			 * the canonical code, the codeword must be incremented,
+			 * then 0's must be appended to the codeword as needed
+			 * to match the next codeword's length.
+			 *
+			 * Since the codeword is bit-reversed, appending 0's is
+			 * a no-op. However, incrementing it is nontrivial. To
+			 * do so efficiently, use the 'bsr' instruction to find
+			 * the last (highest order) 0 bit in the codeword, set
+			 * it, and clear any later (higher order) 1 bits. But
+			 * 'bsr' actually finds the highest order 1 bit, so to
+			 * use it first flip all bits in the codeword by XOR'ing
+			 * it with (1U << len) - 1 == cur_table_end - 1.
+			 */
+			bit = 1U << bsr32(codeword ^ (cur_table_end - 1));
+			codeword &= bit - 1;
+			codeword |= bit;
+		} while (--count);
+
+		/* Advance to the next codeword length. */
+		do {
+			if (++len <= table_bits) {
+				memcpy(&decode_table[cur_table_end],
+				       decode_table,
+				       cur_table_end * sizeof(decode_table[0]));
+				cur_table_end <<= 1;
+			}
+		} while ((count = len_counts[len]) == 0);
+	}
+
+	/* Process codewords with len > table_bits. These require subtables. */
+	cur_table_end = 1U << table_bits;
+	subtable_prefix = -1;
+	subtable_start = 0;
+	for (;;) {
+		u32 entry;
+		unsigned i;
+		unsigned stride;
+		unsigned bit;
+
+		/*
+		 * Start a new subtable if the first 'table_bits' bits of the
+		 * codeword don't match the prefix of the current subtable.
+		 */
+		if ((codeword & ((1U << table_bits) - 1)) != subtable_prefix) {
+			subtable_prefix = (codeword & ((1U << table_bits) - 1));
+			subtable_start = cur_table_end;
+			/*
+			 * Calculate the subtable length. If the codeword has
+			 * length 'table_bits + n', then the subtable needs
+			 * '2^n' entries. But it may need more; if fewer than
+			 * '2^n' codewords of length 'table_bits + n' remain,
+			 * then the length will need to be incremented to bring
+			 * in longer codewords until the subtable can be
+			 * completely filled. Note that because the Huffman
+			 * code is complete, it will always be possible to fill
+			 * the subtable eventually.
+			 */
+			subtable_bits = len - table_bits;
+			codespace_used = count;
+			while (codespace_used < (1U << subtable_bits)) {
+				subtable_bits++;
+				codespace_used = (codespace_used << 1) +
+					len_counts[table_bits + subtable_bits];
+			}
+			cur_table_end = subtable_start + (1U << subtable_bits);
+
+			/*
+			 * Create the entry that points from the main table to
+			 * the subtable.
+			 */
+			decode_table[subtable_prefix] =
+				((u32)subtable_start << 16) |
+				HUFFDEC_EXCEPTIONAL |
+				HUFFDEC_SUBTABLE_POINTER |
+				(subtable_bits << 8) | table_bits;
+		}
+
+		/* Fill the subtable entries for the current codeword. */
+		entry = make_decode_table_entry(decode_results, *sorted_syms++,
+						len - table_bits);
+		i = subtable_start + (codeword >> table_bits);
+		stride = 1U << (len - table_bits);
+		do {
+			decode_table[i] = entry;
+			i += stride;
+		} while (i < cur_table_end);
+
+		/* Advance to the next codeword. */
+		if (codeword == (1U << len) - 1) /* last codeword (all 1's)? */
+			return true;
+		bit = 1U << bsr32(codeword ^ ((1U << len) - 1));
+		codeword &= bit - 1;
+		codeword |= bit;
+		count--;
+		/* Move to the next used codeword length if this one is done. */
+		while (count == 0)
+			count = len_counts[++len];
+	}
+}
+
+/* Build the decode table for the precode. */
+static bool
+build_precode_decode_table(struct libdeflate_decompressor *d)
+{
+	/* When you change TABLEBITS, you must change ENOUGH, and vice versa! */
+	STATIC_ASSERT(PRECODE_TABLEBITS == 7 && PRECODE_ENOUGH == 128);
+
+	STATIC_ASSERT(ARRAY_LEN(precode_decode_results) ==
+		      DEFLATE_NUM_PRECODE_SYMS);
+
+	/* table_bits_ret == NULL: the precode table size is not dynamic. */
+	return build_decode_table(d->u.l.precode_decode_table,
+				  d->u.precode_lens,
+				  DEFLATE_NUM_PRECODE_SYMS,
+				  precode_decode_results,
+				  PRECODE_TABLEBITS,
+				  DEFLATE_MAX_PRE_CODEWORD_LEN,
+				  d->sorted_syms,
+				  NULL);
+}
+
+/* Build the decode table for the literal/length code. */
+static bool
+build_litlen_decode_table(struct libdeflate_decompressor *d,
+			  unsigned num_litlen_syms, unsigned num_offset_syms)
+{
+	/* When you change TABLEBITS, you must change ENOUGH, and vice versa! */
+	STATIC_ASSERT(LITLEN_TABLEBITS == 11 && LITLEN_ENOUGH == 2342);
+
+	STATIC_ASSERT(ARRAY_LEN(litlen_decode_results) ==
+		      DEFLATE_NUM_LITLEN_SYMS);
+
+	/*
+	 * NOTE(review): num_offset_syms is unused here; the signature appears
+	 * to mirror build_offset_decode_table() — confirm with the callers.
+	 *
+	 * Passing &d->litlen_tablebits enables the dynamic table_bits
+	 * strategy for this (largest) table; see build_decode_table().
+	 */
+	return build_decode_table(d->u.litlen_decode_table,
+				  d->u.l.lens,
+				  num_litlen_syms,
+				  litlen_decode_results,
+				  LITLEN_TABLEBITS,
+				  DEFLATE_MAX_LITLEN_CODEWORD_LEN,
+				  d->sorted_syms,
+				  &d->litlen_tablebits);
+}
+
+/* Build the decode table for the offset code. */
+static bool
+build_offset_decode_table(struct libdeflate_decompressor *d,
+			  unsigned num_litlen_syms, unsigned num_offset_syms)
+{
+	/* When you change TABLEBITS, you must change ENOUGH, and vice versa! */
+	STATIC_ASSERT(OFFSET_TABLEBITS == 8 && OFFSET_ENOUGH == 402);
+
+	STATIC_ASSERT(ARRAY_LEN(offset_decode_results) ==
+		      DEFLATE_NUM_OFFSET_SYMS);
+
+	/* The offset codeword lengths start right after the litlen lengths
+	   in the shared d->u.l.lens[] array. */
+	return build_decode_table(d->offset_decode_table,
+				  d->u.l.lens + num_litlen_syms,
+				  num_offset_syms,
+				  offset_decode_results,
+				  OFFSET_TABLEBITS,
+				  DEFLATE_MAX_OFFSET_CODEWORD_LEN,
+				  d->sorted_syms,
+				  NULL);
+}
+
+/*****************************************************************************
+ * Main decompression routine
+ *****************************************************************************/
+
+/* Signature shared by the default and any arch-specific decompressors. */
+typedef enum libdeflate_result (*decompress_func_t)
+	(struct libdeflate_decompressor * restrict d,
+	 const void * restrict in, size_t in_nbytes,
+	 void * restrict out, size_t out_nbytes_avail,
+	 size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret);
+
+/*
+ * Instantiate the portable decoder from the template.  The arch-specific
+ * header(s) below may instantiate additional variants and define
+ * DEFAULT_IMPL and/or arch_select_decompress_func.
+ */
+#define FUNCNAME deflate_decompress_default
+#undef ATTRIBUTES
+#undef EXTRACT_VARBITS
+#undef EXTRACT_VARBITS8
+#include "decompress_template.h"
+
+/* Include architecture-specific implementation(s) if available. */
+#undef DEFAULT_IMPL
+#undef arch_select_decompress_func
+#if defined(ARCH_X86_32) || defined(ARCH_X86_64)
+#  include "x86/decompress_impl.h"
+#endif
+
+#ifndef DEFAULT_IMPL
+#  define DEFAULT_IMPL deflate_decompress_default
+#endif
+
+#ifdef arch_select_decompress_func
+static enum libdeflate_result
+dispatch_decomp(struct libdeflate_decompressor *d,
+		const void *in, size_t in_nbytes,
+		void *out, size_t out_nbytes_avail,
+		size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret);
+
+/*
+ * Starts out pointing at the dispatcher; the first call replaces it with
+ * the selected implementation so later calls skip the selection.
+ * 'volatile' forces each caller to reload the possibly-updated pointer.
+ */
+static volatile decompress_func_t decompress_impl = dispatch_decomp;
+
+/* Choose the best implementation at runtime. */
+static enum libdeflate_result
+dispatch_decomp(struct libdeflate_decompressor *d,
+		const void *in, size_t in_nbytes,
+		void *out, size_t out_nbytes_avail,
+		size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret)
+{
+	decompress_func_t f = arch_select_decompress_func();
+
+	/* Fall back to the portable implementation if no arch-specific one
+	   is usable on this CPU. */
+	if (f == NULL)
+		f = DEFAULT_IMPL;
+
+	/* Cache the choice, then forward this first call to it. */
+	decompress_impl = f;
+	return f(d, in, in_nbytes, out, out_nbytes_avail,
+		 actual_in_nbytes_ret, actual_out_nbytes_ret);
+}
+#else
+/* The best implementation is statically known, so call it directly. */
+# define decompress_impl DEFAULT_IMPL
+#endif
+
+/*
+ * This is the main DEFLATE decompression routine. See libdeflate.h for the
+ * documentation.
+ *
+ * Note that the real code is in decompress_template.h. The part here just
+ * handles calling the appropriate implementation depending on the CPU features
+ * at runtime.
+ */
+LIBDEFLATEAPI enum libdeflate_result
+libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *d,
+				 const void *in, size_t in_nbytes,
+				 void *out, size_t out_nbytes_avail,
+				 size_t *actual_in_nbytes_ret,
+				 size_t *actual_out_nbytes_ret)
+{
+	/* decompress_impl is either DEFAULT_IMPL or the runtime dispatcher,
+	   depending on whether arch_select_decompress_func was defined. */
+	return decompress_impl(d, in, in_nbytes, out, out_nbytes_avail,
+			       actual_in_nbytes_ret, actual_out_nbytes_ret);
+}
+
+/*
+ * Convenience wrapper around libdeflate_deflate_decompress_ex() that does
+ * not report the number of input bytes consumed.
+ */
+LIBDEFLATEAPI enum libdeflate_result
+libdeflate_deflate_decompress(struct libdeflate_decompressor *d,
+			      const void *in, size_t in_nbytes,
+			      void *out, size_t out_nbytes_avail,
+			      size_t *actual_out_nbytes_ret)
+{
+	return libdeflate_deflate_decompress_ex(d, in, in_nbytes,
+						out, out_nbytes_avail,
+						NULL, actual_out_nbytes_ret);
+}
+
+/*
+ * Allocate and zero-initialize a decompressor.  Returns NULL on allocation
+ * failure.
+ */
+LIBDEFLATEAPI struct libdeflate_decompressor *
+libdeflate_alloc_decompressor(void)
+{
+	/*
+	 * Note that only certain parts of the decompressor actually must be
+	 * initialized here:
+	 *
+	 * - 'static_codes_loaded' must be initialized to false.
+	 *
+	 * - The first half of the main portion of each decode table must be
+	 *   initialized to any value, to avoid reading from uninitialized
+	 *   memory during table expansion in build_decode_table(). (Although,
+	 *   this is really just to avoid warnings with dynamic tools like
+	 *   valgrind, since build_decode_table() is guaranteed to initialize
+	 *   all entries eventually anyway.)
+	 *
+	 * But for simplicity, we currently just zero the whole decompressor.
+	 */
+	struct libdeflate_decompressor *d = libdeflate_malloc(sizeof(*d));
+
+	if (d == NULL)
+		return NULL;
+	memset(d, 0, sizeof(*d));
+	return d;
+}
+
+/* Release a decompressor allocated by libdeflate_alloc_decompressor(). */
+LIBDEFLATEAPI void
+libdeflate_free_decompressor(struct libdeflate_decompressor *d)
+{
+	libdeflate_free(d);
+}
diff --git a/tools/z64compress/src/enc/libdeflate/lib/gzip_compress.c b/tools/z64compress/src/enc/libdeflate/lib/gzip_compress.c
new file mode 100644
index 000000000..e343e5068
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/lib/gzip_compress.c
@@ -0,0 +1,92 @@
+/*
+ * gzip_compress.c - compress with a gzip wrapper
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "deflate_compress.h"
+#include "gzip_constants.h"
+
+#include "libdeflate.h"
+
+/*
+ * Compress 'in' into a complete gzip stream in 'out': a minimal 10-byte
+ * header (FLG == 0, so no optional fields), the raw DEFLATE data, then the
+ * 8-byte footer (CRC32 of the input, and ISIZE == input size modulo 2^32).
+ * Returns the total number of bytes written, or 0 if 'out' is too small.
+ */
+LIBDEFLATEAPI size_t
+libdeflate_gzip_compress(struct libdeflate_compressor *c,
+			 const void *in, size_t in_nbytes,
+			 void *out, size_t out_nbytes_avail)
+{
+	u8 *out_next = out;
+	unsigned compression_level;
+	u8 xfl;
+	size_t deflate_size;
+
+	/* Need room for header + footer plus at least one DEFLATE byte. */
+	if (out_nbytes_avail <= GZIP_MIN_OVERHEAD)
+		return 0;
+
+	/* ID1 */
+	*out_next++ = GZIP_ID1;
+	/* ID2 */
+	*out_next++ = GZIP_ID2;
+	/* CM */
+	*out_next++ = GZIP_CM_DEFLATE;
+	/* FLG: no optional fields (FNAME, FCOMMENT, FEXTRA, FHCRC) */
+	*out_next++ = 0;
+	/* MTIME */
+	put_unaligned_le32(GZIP_MTIME_UNAVAILABLE, out_next);
+	out_next += 4;
+	/* XFL: advertise fastest/slowest compression per the level in use */
+	xfl = 0;
+	compression_level = libdeflate_get_compression_level(c);
+	if (compression_level < 2)
+		xfl |= GZIP_XFL_FASTEST_COMPRESSION;
+	else if (compression_level >= 8)
+		xfl |= GZIP_XFL_SLOWEST_COMPRESSION;
+	*out_next++ = xfl;
+	/* OS */
+	*out_next++ = GZIP_OS_UNKNOWN;
+
+	/* Compressed data */
+	deflate_size = libdeflate_deflate_compress(c, in, in_nbytes, out_next,
+					out_nbytes_avail - GZIP_MIN_OVERHEAD);
+	if (deflate_size == 0)
+		return 0;
+	out_next += deflate_size;
+
+	/* CRC32 */
+	put_unaligned_le32(libdeflate_crc32(0, in, in_nbytes), out_next);
+	out_next += 4;
+
+	/* ISIZE (input size modulo 2^32, per the u32 truncation) */
+	put_unaligned_le32((u32)in_nbytes, out_next);
+	out_next += 4;
+
+	return out_next - (u8 *)out;
+}
+
+/*
+ * Worst-case gzip output size for 'in_nbytes' of input: the fixed gzip
+ * header+footer overhead plus the DEFLATE worst-case bound.
+ */
+LIBDEFLATEAPI size_t
+libdeflate_gzip_compress_bound(struct libdeflate_compressor *c,
+			       size_t in_nbytes)
+{
+	return GZIP_MIN_OVERHEAD +
+	       libdeflate_deflate_compress_bound(c, in_nbytes);
+}
diff --git a/tools/z64compress/src/enc/libdeflate/lib/gzip_constants.h b/tools/z64compress/src/enc/libdeflate/lib/gzip_constants.h
new file mode 100644
index 000000000..35e4728d8
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/lib/gzip_constants.h
@@ -0,0 +1,45 @@
+/*
+ * gzip_constants.h - constants for the gzip wrapper format
+ */
+
+#ifndef LIB_GZIP_CONSTANTS_H
+#define LIB_GZIP_CONSTANTS_H
+
+#define GZIP_MIN_HEADER_SIZE 10
+#define GZIP_FOOTER_SIZE 8
+#define GZIP_MIN_OVERHEAD (GZIP_MIN_HEADER_SIZE + GZIP_FOOTER_SIZE)
+
+#define GZIP_ID1 0x1F
+#define GZIP_ID2 0x8B
+
+#define GZIP_CM_DEFLATE 8
+
+#define GZIP_FTEXT 0x01
+#define GZIP_FHCRC 0x02
+#define GZIP_FEXTRA 0x04
+#define GZIP_FNAME 0x08
+#define GZIP_FCOMMENT 0x10
+#define GZIP_FRESERVED 0xE0
+
+#define GZIP_MTIME_UNAVAILABLE 0
+
+#define GZIP_XFL_SLOWEST_COMPRESSION 0x02
+#define GZIP_XFL_FASTEST_COMPRESSION 0x04
+
+#define GZIP_OS_FAT 0
+#define GZIP_OS_AMIGA 1
+#define GZIP_OS_VMS 2
+#define GZIP_OS_UNIX 3
+#define GZIP_OS_VM_CMS 4
+#define GZIP_OS_ATARI_TOS 5
+#define GZIP_OS_HPFS 6
+#define GZIP_OS_MACINTOSH 7
+#define GZIP_OS_Z_SYSTEM 8
+#define GZIP_OS_CP_M 9
+#define GZIP_OS_TOPS_20 10
+#define GZIP_OS_NTFS 11
+#define GZIP_OS_QDOS 12
+#define GZIP_OS_RISCOS 13
+#define GZIP_OS_UNKNOWN 255
+
+#endif /* LIB_GZIP_CONSTANTS_H */
diff --git a/tools/z64compress/src/enc/libdeflate/lib/gzip_decompress.c b/tools/z64compress/src/enc/libdeflate/lib/gzip_decompress.c
new file mode 100644
index 000000000..9518e7047
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/lib/gzip_decompress.c
@@ -0,0 +1,146 @@
+/*
+ * gzip_decompress.c - decompress with a gzip wrapper
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "lib_common.h"
+#include "gzip_constants.h"
+
+#include "libdeflate.h"
+
+LIBDEFLATEAPI enum libdeflate_result
+libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *d,
+ const void *in, size_t in_nbytes,
+ void *out, size_t out_nbytes_avail,
+ size_t *actual_in_nbytes_ret,
+ size_t *actual_out_nbytes_ret)
+{
+ const u8 *in_next = in;
+ const u8 * const in_end = in_next + in_nbytes;
+ u8 flg;
+ size_t actual_in_nbytes;
+ size_t actual_out_nbytes;
+ enum libdeflate_result result;
+
+ if (in_nbytes < GZIP_MIN_OVERHEAD)
+ return LIBDEFLATE_BAD_DATA;
+
+ /* ID1 */
+ if (*in_next++ != GZIP_ID1)
+ return LIBDEFLATE_BAD_DATA;
+ /* ID2 */
+ if (*in_next++ != GZIP_ID2)
+ return LIBDEFLATE_BAD_DATA;
+ /* CM */
+ if (*in_next++ != GZIP_CM_DEFLATE)
+ return LIBDEFLATE_BAD_DATA;
+ flg = *in_next++;
+ /* MTIME */
+ in_next += 4;
+ /* XFL */
+ in_next += 1;
+ /* OS */
+ in_next += 1;
+
+ if (flg & GZIP_FRESERVED)
+ return LIBDEFLATE_BAD_DATA;
+
+ /* Extra field */
+ if (flg & GZIP_FEXTRA) {
+ u16 xlen = get_unaligned_le16(in_next);
+ in_next += 2;
+
+ if (in_end - in_next < (u32)xlen + GZIP_FOOTER_SIZE)
+ return LIBDEFLATE_BAD_DATA;
+
+ in_next += xlen;
+ }
+
+ /* Original file name (zero terminated) */
+ if (flg & GZIP_FNAME) {
+ while (*in_next++ != 0 && in_next != in_end)
+ ;
+ if (in_end - in_next < GZIP_FOOTER_SIZE)
+ return LIBDEFLATE_BAD_DATA;
+ }
+
+ /* File comment (zero terminated) */
+ if (flg & GZIP_FCOMMENT) {
+ while (*in_next++ != 0 && in_next != in_end)
+ ;
+ if (in_end - in_next < GZIP_FOOTER_SIZE)
+ return LIBDEFLATE_BAD_DATA;
+ }
+
+ /* CRC16 for gzip header */
+ if (flg & GZIP_FHCRC) {
+ in_next += 2;
+ if (in_end - in_next < GZIP_FOOTER_SIZE)
+ return LIBDEFLATE_BAD_DATA;
+ }
+
+ /* Compressed data */
+ result = libdeflate_deflate_decompress_ex(d, in_next,
+ in_end - GZIP_FOOTER_SIZE - in_next,
+ out, out_nbytes_avail,
+ &actual_in_nbytes,
+ actual_out_nbytes_ret);
+ if (result != LIBDEFLATE_SUCCESS)
+ return result;
+
+ if (actual_out_nbytes_ret)
+ actual_out_nbytes = *actual_out_nbytes_ret;
+ else
+ actual_out_nbytes = out_nbytes_avail;
+
+ in_next += actual_in_nbytes;
+
+ /* CRC32 */
+ if (libdeflate_crc32(0, out, actual_out_nbytes) !=
+ get_unaligned_le32(in_next))
+ return LIBDEFLATE_BAD_DATA;
+ in_next += 4;
+
+ /* ISIZE */
+ if ((u32)actual_out_nbytes != get_unaligned_le32(in_next))
+ return LIBDEFLATE_BAD_DATA;
+ in_next += 4;
+
+ if (actual_in_nbytes_ret)
+ *actual_in_nbytes_ret = in_next - (u8 *)in;
+
+ return LIBDEFLATE_SUCCESS;
+}
+
+LIBDEFLATEAPI enum libdeflate_result
+libdeflate_gzip_decompress(struct libdeflate_decompressor *d,
+ const void *in, size_t in_nbytes,
+ void *out, size_t out_nbytes_avail,
+ size_t *actual_out_nbytes_ret)
+{
+ return libdeflate_gzip_decompress_ex(d, in, in_nbytes,
+ out, out_nbytes_avail,
+ NULL, actual_out_nbytes_ret);
+}
diff --git a/tools/z64compress/src/enc/libdeflate/lib/hc_matchfinder.h b/tools/z64compress/src/enc/libdeflate/lib/hc_matchfinder.h
new file mode 100644
index 000000000..a0cddfca1
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/lib/hc_matchfinder.h
@@ -0,0 +1,401 @@
+/*
+ * hc_matchfinder.h - Lempel-Ziv matchfinding with a hash table of linked lists
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ---------------------------------------------------------------------------
+ *
+ * Algorithm
+ *
+ * This is a Hash Chains (hc) based matchfinder.
+ *
+ * The main data structure is a hash table where each hash bucket contains a
+ * linked list (or "chain") of sequences whose first 4 bytes share the same hash
+ * code. Each sequence is identified by its starting position in the input
+ * buffer.
+ *
+ * The algorithm processes the input buffer sequentially. At each byte
+ * position, the hash code of the first 4 bytes of the sequence beginning at
+ * that position (the sequence being matched against) is computed. This
+ * identifies the hash bucket to use for that position. Then, this hash
+ * bucket's linked list is searched for matches. Then, a new linked list node
+ * is created to represent the current sequence and is prepended to the list.
+ *
+ * This algorithm has several useful properties:
+ *
+ * - It only finds true Lempel-Ziv matches; i.e., those where the matching
+ * sequence occurs prior to the sequence being matched against.
+ *
+ * - The sequences in each linked list are always sorted by decreasing starting
+ * position. Therefore, the closest (smallest offset) matches are found
+ * first, which in many compression formats tend to be the cheapest to encode.
+ *
+ * - Although fast running time is not guaranteed due to the possibility of the
+ * lists getting very long, the worst degenerate behavior can be easily
+ * prevented by capping the number of nodes searched at each position.
+ *
+ * - If the compressor decides not to search for matches at a certain position,
+ * then that position can be quickly inserted without searching the list.
+ *
+ * - The algorithm is adaptable to sliding windows: just store the positions
+ * relative to a "base" value that is updated from time to time, and stop
+ * searching each list when the sequences get too far away.
+ *
+ * ----------------------------------------------------------------------------
+ *
+ * Optimizations
+ *
+ * The main hash table and chains handle length 4+ matches. Length 3 matches
+ * are handled by a separate hash table with no chains. This works well for
+ * typical "greedy" or "lazy"-style compressors, where length 3 matches are
+ * often only helpful if they have small offsets. Instead of searching a full
+ * chain for length 3+ matches, the algorithm just checks for one close length 3
+ * match, then focuses on finding length 4+ matches.
+ *
+ * The longest_match() and skip_bytes() functions are inlined into the
+ * compressors that use them. This isn't just about saving the overhead of a
+ * function call. These functions are intended to be called from the inner
+ * loops of compressors, where giving the compiler more control over register
+ * allocation is very helpful. There is also significant benefit to be gained
+ * from allowing the CPU to predict branches independently at each call site.
+ * For example, "lazy"-style compressors can be written with two calls to
+ * longest_match(), each of which starts with a different 'best_len' and
+ * therefore has significantly different performance characteristics.
+ *
+ * Although any hash function can be used, a multiplicative hash is fast and
+ * works well.
+ *
+ * On some processors, it is significantly faster to extend matches by whole
+ * words (32 or 64 bits) instead of by individual bytes. For this to be the
+ * case, the processor must implement unaligned memory accesses efficiently and
+ * must have either a fast "find first set bit" instruction or a fast "find last
+ * set bit" instruction, depending on the processor's endianness.
+ *
+ * The code uses one loop for finding the first match and one loop for finding a
+ * longer match. Each of these loops is tuned for its respective task and in
+ * combination are faster than a single generalized loop that handles both
+ * tasks.
+ *
+ * The code also uses a tight inner loop that only compares the last and first
+ * bytes of a potential match. It is only when these bytes match that a full
+ * match extension is attempted.
+ *
+ * ----------------------------------------------------------------------------
+ */
+
+#ifndef LIB_HC_MATCHFINDER_H
+#define LIB_HC_MATCHFINDER_H
+
+#include "matchfinder_common.h"
+
+#define HC_MATCHFINDER_HASH3_ORDER 15
+#define HC_MATCHFINDER_HASH4_ORDER 16
+
+#define HC_MATCHFINDER_TOTAL_HASH_SIZE \
+ (((1UL << HC_MATCHFINDER_HASH3_ORDER) + \
+ (1UL << HC_MATCHFINDER_HASH4_ORDER)) * sizeof(mf_pos_t))
+
+struct MATCHFINDER_ALIGNED hc_matchfinder {
+
+ /* The hash table for finding length 3 matches */
+ mf_pos_t hash3_tab[1UL << HC_MATCHFINDER_HASH3_ORDER];
+
+ /* The hash table which contains the first nodes of the linked lists for
+ * finding length 4+ matches */
+ mf_pos_t hash4_tab[1UL << HC_MATCHFINDER_HASH4_ORDER];
+
+ /* The "next node" references for the linked lists. The "next node" of
+ * the node for the sequence with position 'pos' is 'next_tab[pos]'. */
+ mf_pos_t next_tab[MATCHFINDER_WINDOW_SIZE];
+};
+
+/* Prepare the matchfinder for a new input buffer. */
+static forceinline void
+hc_matchfinder_init(struct hc_matchfinder *mf)
+{
+ STATIC_ASSERT(HC_MATCHFINDER_TOTAL_HASH_SIZE %
+ MATCHFINDER_SIZE_ALIGNMENT == 0);
+
+ matchfinder_init((mf_pos_t *)mf, HC_MATCHFINDER_TOTAL_HASH_SIZE);
+}
+
+static forceinline void
+hc_matchfinder_slide_window(struct hc_matchfinder *mf)
+{
+ STATIC_ASSERT(sizeof(*mf) % MATCHFINDER_SIZE_ALIGNMENT == 0);
+
+ matchfinder_rebase((mf_pos_t *)mf, sizeof(*mf));
+}
+
+/*
+ * Find the longest match longer than 'best_len' bytes.
+ *
+ * @mf
+ * The matchfinder structure.
+ * @in_base_p
+ * Location of a pointer which points to the place in the input data the
+ * matchfinder currently stores positions relative to. This may be updated
+ * by this function.
+ * @in_next
+ * Pointer to the next position in the input buffer, i.e. the sequence
+ * being matched against.
+ * @best_len
+ * Require a match longer than this length.
+ * @max_len
+ * The maximum permissible match length at this position.
+ * @nice_len
+ * Stop searching if a match of at least this length is found.
+ * Must be <= @max_len.
+ * @max_search_depth
+ * Limit on the number of potential matches to consider. Must be >= 1.
+ * @next_hashes
+ * The precomputed hash codes for the sequence beginning at @in_next.
+ * These will be used and then updated with the precomputed hashcodes for
+ * the sequence beginning at @in_next + 1.
+ * @offset_ret
+ * If a match is found, its offset is returned in this location.
+ *
+ * Return the length of the match found, or 'best_len' if no match longer than
+ * 'best_len' was found.
+ */
+static forceinline u32
+hc_matchfinder_longest_match(struct hc_matchfinder * const mf,
+ const u8 ** const in_base_p,
+ const u8 * const in_next,
+ u32 best_len,
+ const u32 max_len,
+ const u32 nice_len,
+ const u32 max_search_depth,
+ u32 * const next_hashes,
+ u32 * const offset_ret)
+{
+ u32 depth_remaining = max_search_depth;
+ const u8 *best_matchptr = in_next;
+ mf_pos_t cur_node3, cur_node4;
+ u32 hash3, hash4;
+ u32 next_hashseq;
+ u32 seq4;
+ const u8 *matchptr;
+ u32 len;
+ u32 cur_pos = in_next - *in_base_p;
+ const u8 *in_base;
+ mf_pos_t cutoff;
+
+ if (cur_pos == MATCHFINDER_WINDOW_SIZE) {
+ hc_matchfinder_slide_window(mf);
+ *in_base_p += MATCHFINDER_WINDOW_SIZE;
+ cur_pos = 0;
+ }
+
+ in_base = *in_base_p;
+ cutoff = cur_pos - MATCHFINDER_WINDOW_SIZE;
+
+ if (unlikely(max_len < 5)) /* can we read 4 bytes from 'in_next + 1'? */
+ goto out;
+
+ /* Get the precomputed hash codes. */
+ hash3 = next_hashes[0];
+ hash4 = next_hashes[1];
+
+ /* From the hash buckets, get the first node of each linked list. */
+ cur_node3 = mf->hash3_tab[hash3];
+ cur_node4 = mf->hash4_tab[hash4];
+
+ /* Update for length 3 matches. This replaces the singleton node in the
+ * 'hash3' bucket with the node for the current sequence. */
+ mf->hash3_tab[hash3] = cur_pos;
+
+ /* Update for length 4 matches. This prepends the node for the current
+ * sequence to the linked list in the 'hash4' bucket. */
+ mf->hash4_tab[hash4] = cur_pos;
+ mf->next_tab[cur_pos] = cur_node4;
+
+ /* Compute the next hash codes. */
+ next_hashseq = get_unaligned_le32(in_next + 1);
+ next_hashes[0] = lz_hash(next_hashseq & 0xFFFFFF, HC_MATCHFINDER_HASH3_ORDER);
+ next_hashes[1] = lz_hash(next_hashseq, HC_MATCHFINDER_HASH4_ORDER);
+ prefetchw(&mf->hash3_tab[next_hashes[0]]);
+ prefetchw(&mf->hash4_tab[next_hashes[1]]);
+
+ if (best_len < 4) { /* No match of length >= 4 found yet? */
+
+ /* Check for a length 3 match if needed. */
+
+ if (cur_node3 <= cutoff)
+ goto out;
+
+ seq4 = load_u32_unaligned(in_next);
+
+ if (best_len < 3) {
+ matchptr = &in_base[cur_node3];
+ if (load_u24_unaligned(matchptr) == loaded_u32_to_u24(seq4)) {
+ best_len = 3;
+ best_matchptr = matchptr;
+ }
+ }
+
+ /* Check for a length 4 match. */
+
+ if (cur_node4 <= cutoff)
+ goto out;
+
+ for (;;) {
+ /* No length 4 match found yet. Check the first 4 bytes. */
+ matchptr = &in_base[cur_node4];
+
+ if (load_u32_unaligned(matchptr) == seq4)
+ break;
+
+ /* The first 4 bytes did not match. Keep trying. */
+ cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)];
+ if (cur_node4 <= cutoff || !--depth_remaining)
+ goto out;
+ }
+
+ /* Found a match of length >= 4. Extend it to its full length. */
+ best_matchptr = matchptr;
+ best_len = lz_extend(in_next, best_matchptr, 4, max_len);
+ if (best_len >= nice_len)
+ goto out;
+ cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)];
+ if (cur_node4 <= cutoff || !--depth_remaining)
+ goto out;
+ } else {
+ if (cur_node4 <= cutoff || best_len >= nice_len)
+ goto out;
+ }
+
+ /* Check for matches of length >= 5. */
+
+ for (;;) {
+ for (;;) {
+ matchptr = &in_base[cur_node4];
+
+ /* Already found a length 4 match. Try for a longer
+ * match; start by checking either the last 4 bytes and
+ * the first 4 bytes, or the last byte. (The last byte,
+ * the one which would extend the match length by 1, is
+ * the most important.) */
+ #if UNALIGNED_ACCESS_IS_FAST
+ if ((load_u32_unaligned(matchptr + best_len - 3) ==
+ load_u32_unaligned(in_next + best_len - 3)) &&
+ (load_u32_unaligned(matchptr) ==
+ load_u32_unaligned(in_next)))
+ #else
+ if (matchptr[best_len] == in_next[best_len])
+ #endif
+ break;
+
+ /* Continue to the next node in the list. */
+ cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)];
+ if (cur_node4 <= cutoff || !--depth_remaining)
+ goto out;
+ }
+
+ #if UNALIGNED_ACCESS_IS_FAST
+ len = 4;
+ #else
+ len = 0;
+ #endif
+ len = lz_extend(in_next, matchptr, len, max_len);
+ if (len > best_len) {
+ /* This is the new longest match. */
+ best_len = len;
+ best_matchptr = matchptr;
+ if (best_len >= nice_len)
+ goto out;
+ }
+
+ /* Continue to the next node in the list. */
+ cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)];
+ if (cur_node4 <= cutoff || !--depth_remaining)
+ goto out;
+ }
+out:
+ *offset_ret = in_next - best_matchptr;
+ return best_len;
+}
+
+/*
+ * Advance the matchfinder, but don't search for matches.
+ *
+ * @mf
+ * The matchfinder structure.
+ * @in_base_p
+ * Location of a pointer which points to the place in the input data the
+ * matchfinder currently stores positions relative to. This may be updated
+ * by this function.
+ * @in_next
+ * Pointer to the next position in the input buffer.
+ * @in_end
+ * Pointer to the end of the input buffer.
+ * @count
+ * The number of bytes to advance. Must be > 0.
+ * @next_hashes
+ * The precomputed hash codes for the sequence beginning at @in_next.
+ * These will be used and then updated with the precomputed hashcodes for
+ * the sequence beginning at @in_next + @count.
+ */
+static forceinline void
+hc_matchfinder_skip_bytes(struct hc_matchfinder * const mf,
+ const u8 ** const in_base_p,
+ const u8 *in_next,
+ const u8 * const in_end,
+ const u32 count,
+ u32 * const next_hashes)
+{
+ u32 cur_pos;
+ u32 hash3, hash4;
+ u32 next_hashseq;
+ u32 remaining = count;
+
+ if (unlikely(count + 5 > in_end - in_next))
+ return;
+
+ cur_pos = in_next - *in_base_p;
+ hash3 = next_hashes[0];
+ hash4 = next_hashes[1];
+ do {
+ if (cur_pos == MATCHFINDER_WINDOW_SIZE) {
+ hc_matchfinder_slide_window(mf);
+ *in_base_p += MATCHFINDER_WINDOW_SIZE;
+ cur_pos = 0;
+ }
+ mf->hash3_tab[hash3] = cur_pos;
+ mf->next_tab[cur_pos] = mf->hash4_tab[hash4];
+ mf->hash4_tab[hash4] = cur_pos;
+
+ next_hashseq = get_unaligned_le32(++in_next);
+ hash3 = lz_hash(next_hashseq & 0xFFFFFF, HC_MATCHFINDER_HASH3_ORDER);
+ hash4 = lz_hash(next_hashseq, HC_MATCHFINDER_HASH4_ORDER);
+ cur_pos++;
+ } while (--remaining);
+
+ prefetchw(&mf->hash3_tab[hash3]);
+ prefetchw(&mf->hash4_tab[hash4]);
+ next_hashes[0] = hash3;
+ next_hashes[1] = hash4;
+}
+
+#endif /* LIB_HC_MATCHFINDER_H */
diff --git a/tools/z64compress/src/enc/libdeflate/lib/ht_matchfinder.h b/tools/z64compress/src/enc/libdeflate/lib/ht_matchfinder.h
new file mode 100644
index 000000000..6e5a187c1
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/lib/ht_matchfinder.h
@@ -0,0 +1,234 @@
+/*
+ * ht_matchfinder.h - Lempel-Ziv matchfinding with a hash table
+ *
+ * Copyright 2022 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ---------------------------------------------------------------------------
+ *
+ * This is a Hash Table (ht) matchfinder.
+ *
+ * This is a variant of the Hash Chains (hc) matchfinder that is optimized for
+ * very fast compression. The ht_matchfinder stores the hash chains inline in
+ * the hash table, whereas the hc_matchfinder stores them in a separate array.
+ * Storing the hash chains inline is the faster method when max_search_depth
+ * (the maximum chain length) is very small. It is not appropriate when
+ * max_search_depth is larger, as then it uses too much memory.
+ *
+ * Due to its focus on speed, the ht_matchfinder doesn't support length 3
+ * matches. It also doesn't allow max_search_depth to vary at runtime; it is
+ * fixed at build time as HT_MATCHFINDER_BUCKET_SIZE.
+ *
+ * See hc_matchfinder.h for more information.
+ */
+
+#ifndef LIB_HT_MATCHFINDER_H
+#define LIB_HT_MATCHFINDER_H
+
+#include "matchfinder_common.h"
+
+#define HT_MATCHFINDER_HASH_ORDER 15
+#define HT_MATCHFINDER_BUCKET_SIZE 2
+
+#define HT_MATCHFINDER_MIN_MATCH_LEN 4
+/* Minimum value of max_len for ht_matchfinder_longest_match() */
+#define HT_MATCHFINDER_REQUIRED_NBYTES 5
+
+struct MATCHFINDER_ALIGNED ht_matchfinder {
+ mf_pos_t hash_tab[1UL << HT_MATCHFINDER_HASH_ORDER]
+ [HT_MATCHFINDER_BUCKET_SIZE];
+};
+
+static forceinline void
+ht_matchfinder_init(struct ht_matchfinder *mf)
+{
+ STATIC_ASSERT(sizeof(*mf) % MATCHFINDER_SIZE_ALIGNMENT == 0);
+
+ matchfinder_init((mf_pos_t *)mf, sizeof(*mf));
+}
+
+static forceinline void
+ht_matchfinder_slide_window(struct ht_matchfinder *mf)
+{
+ matchfinder_rebase((mf_pos_t *)mf, sizeof(*mf));
+}
+
+/* Note: max_len must be >= HT_MATCHFINDER_REQUIRED_NBYTES */
+static forceinline u32
+ht_matchfinder_longest_match(struct ht_matchfinder * const mf,
+ const u8 ** const in_base_p,
+ const u8 * const in_next,
+ const u32 max_len,
+ const u32 nice_len,
+ u32 * const next_hash,
+ u32 * const offset_ret)
+{
+ u32 best_len = 0;
+ const u8 *best_matchptr = in_next;
+ u32 cur_pos = in_next - *in_base_p;
+ const u8 *in_base;
+ mf_pos_t cutoff;
+ u32 hash;
+ u32 seq;
+ mf_pos_t cur_node;
+ const u8 *matchptr;
+#if HT_MATCHFINDER_BUCKET_SIZE > 1
+ mf_pos_t to_insert;
+ u32 len;
+#endif
+#if HT_MATCHFINDER_BUCKET_SIZE > 2
+ int i;
+#endif
+
+ /* This is assumed throughout this function. */
+ STATIC_ASSERT(HT_MATCHFINDER_MIN_MATCH_LEN == 4);
+
+ if (cur_pos == MATCHFINDER_WINDOW_SIZE) {
+ ht_matchfinder_slide_window(mf);
+ *in_base_p += MATCHFINDER_WINDOW_SIZE;
+ cur_pos = 0;
+ }
+ in_base = *in_base_p;
+ cutoff = cur_pos - MATCHFINDER_WINDOW_SIZE;
+
+ hash = *next_hash;
+ STATIC_ASSERT(HT_MATCHFINDER_REQUIRED_NBYTES == 5);
+ *next_hash = lz_hash(get_unaligned_le32(in_next + 1),
+ HT_MATCHFINDER_HASH_ORDER);
+ seq = load_u32_unaligned(in_next);
+ prefetchw(&mf->hash_tab[*next_hash]);
+#if HT_MATCHFINDER_BUCKET_SIZE == 1
+ /* Hand-unrolled version for BUCKET_SIZE == 1 */
+ cur_node = mf->hash_tab[hash][0];
+ mf->hash_tab[hash][0] = cur_pos;
+ if (cur_node <= cutoff)
+ goto out;
+ matchptr = &in_base[cur_node];
+ if (load_u32_unaligned(matchptr) == seq) {
+ best_len = lz_extend(in_next, matchptr, 4, max_len);
+ best_matchptr = matchptr;
+ }
+#elif HT_MATCHFINDER_BUCKET_SIZE == 2
+ /*
+ * Hand-unrolled version for BUCKET_SIZE == 2. The logic here also
+ * differs slightly in that it copies the first entry to the second even
+ * if nice_len is reached on the first, as this can be slightly faster.
+ */
+ cur_node = mf->hash_tab[hash][0];
+ mf->hash_tab[hash][0] = cur_pos;
+ if (cur_node <= cutoff)
+ goto out;
+ matchptr = &in_base[cur_node];
+
+ to_insert = cur_node;
+ cur_node = mf->hash_tab[hash][1];
+ mf->hash_tab[hash][1] = to_insert;
+
+ if (load_u32_unaligned(matchptr) == seq) {
+ best_len = lz_extend(in_next, matchptr, 4, max_len);
+ best_matchptr = matchptr;
+ if (cur_node <= cutoff || best_len >= nice_len)
+ goto out;
+ matchptr = &in_base[cur_node];
+ if (load_u32_unaligned(matchptr) == seq &&
+ load_u32_unaligned(matchptr + best_len - 3) ==
+ load_u32_unaligned(in_next + best_len - 3)) {
+ len = lz_extend(in_next, matchptr, 4, max_len);
+ if (len > best_len) {
+ best_len = len;
+ best_matchptr = matchptr;
+ }
+ }
+ } else {
+ if (cur_node <= cutoff)
+ goto out;
+ matchptr = &in_base[cur_node];
+ if (load_u32_unaligned(matchptr) == seq) {
+ best_len = lz_extend(in_next, matchptr, 4, max_len);
+ best_matchptr = matchptr;
+ }
+ }
+#else
+ /* Generic version for HT_MATCHFINDER_BUCKET_SIZE > 2 */
+ to_insert = cur_pos;
+ for (i = 0; i < HT_MATCHFINDER_BUCKET_SIZE; i++) {
+ cur_node = mf->hash_tab[hash][i];
+ mf->hash_tab[hash][i] = to_insert;
+ if (cur_node <= cutoff)
+ goto out;
+ matchptr = &in_base[cur_node];
+ if (load_u32_unaligned(matchptr) == seq) {
+ len = lz_extend(in_next, matchptr, 4, max_len);
+ if (len > best_len) {
+ best_len = len;
+ best_matchptr = matchptr;
+ if (best_len >= nice_len)
+ goto out;
+ }
+ }
+ to_insert = cur_node;
+ }
+#endif
+out:
+ *offset_ret = in_next - best_matchptr;
+ return best_len;
+}
+
+static forceinline void
+ht_matchfinder_skip_bytes(struct ht_matchfinder * const mf,
+ const u8 ** const in_base_p,
+ const u8 *in_next,
+ const u8 * const in_end,
+ const u32 count,
+ u32 * const next_hash)
+{
+ s32 cur_pos = in_next - *in_base_p;
+ u32 hash;
+ u32 remaining = count;
+ int i;
+
+ if (unlikely(count + HT_MATCHFINDER_REQUIRED_NBYTES > in_end - in_next))
+ return;
+
+ if (cur_pos + count - 1 >= MATCHFINDER_WINDOW_SIZE) {
+ ht_matchfinder_slide_window(mf);
+ *in_base_p += MATCHFINDER_WINDOW_SIZE;
+ cur_pos -= MATCHFINDER_WINDOW_SIZE;
+ }
+
+ hash = *next_hash;
+ do {
+ for (i = HT_MATCHFINDER_BUCKET_SIZE - 1; i > 0; i--)
+ mf->hash_tab[hash][i] = mf->hash_tab[hash][i - 1];
+ mf->hash_tab[hash][0] = cur_pos;
+
+ hash = lz_hash(get_unaligned_le32(++in_next),
+ HT_MATCHFINDER_HASH_ORDER);
+ cur_pos++;
+ } while (--remaining);
+
+ prefetchw(&mf->hash_tab[hash]);
+ *next_hash = hash;
+}
+
+#endif /* LIB_HT_MATCHFINDER_H */
diff --git a/tools/z64compress/src/enc/libdeflate/lib/lib_common.h b/tools/z64compress/src/enc/libdeflate/lib/lib_common.h
new file mode 100644
index 000000000..6aad0feec
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/lib/lib_common.h
@@ -0,0 +1,94 @@
+/*
+ * lib_common.h - internal header included by all library code
+ */
+
+#ifndef LIB_LIB_COMMON_H
+#define LIB_LIB_COMMON_H
+
+#include "../common_defs.h"
+
+#ifdef LIBDEFLATE_H
+ /*
+ * When building the library, LIBDEFLATEAPI needs to be defined properly before
+ * including libdeflate.h.
+ */
+# error "lib_common.h must always be included before libdeflate.h"
+#endif
+
+#if defined(LIBDEFLATE_DLL) && (defined(_WIN32) || defined(__CYGWIN__))
+# define LIBDEFLATE_EXPORT_SYM __declspec(dllexport)
+#elif defined(__GNUC__)
+# define LIBDEFLATE_EXPORT_SYM __attribute__((visibility("default")))
+#else
+# define LIBDEFLATE_EXPORT_SYM
+#endif
+
+/*
+ * On i386, gcc assumes that the stack is 16-byte aligned at function entry.
+ * However, some compilers (e.g. MSVC) and programming languages (e.g. Delphi)
+ * only guarantee 4-byte alignment when calling functions. This is mainly an
+ * issue on Windows, but it has been seen on Linux too. Work around this ABI
+ * incompatibility by realigning the stack pointer when entering libdeflate.
+ * This prevents crashes in SSE/AVX code.
+ */
+#if defined(__GNUC__) && defined(__i386__)
+# define LIBDEFLATE_ALIGN_STACK __attribute__((force_align_arg_pointer))
+#else
+# define LIBDEFLATE_ALIGN_STACK
+#endif
+
+#define LIBDEFLATEAPI LIBDEFLATE_EXPORT_SYM LIBDEFLATE_ALIGN_STACK
+
+void *libdeflate_malloc(size_t size);
+void libdeflate_free(void *ptr);
+
+void *libdeflate_aligned_malloc(size_t alignment, size_t size);
+void libdeflate_aligned_free(void *ptr);
+
+#ifdef FREESTANDING
+/*
+ * With -ffreestanding, <string.h> may be missing, and we must provide
+ * implementations of memset(), memcpy(), memmove(), and memcmp().
+ * See https://gcc.gnu.org/onlinedocs/gcc/Standards.html
+ *
+ * Also, -ffreestanding disables interpreting calls to these functions as
+ * built-ins. E.g., calling memcpy(&v, p, WORDBYTES) will make a function call,
+ * not be optimized to a single load instruction. For performance reasons we
+ * don't want that. So, declare these functions as macros that expand to the
+ * corresponding built-ins. This approach is recommended in the gcc man page.
+ * We still need the actual function definitions in case gcc calls them.
+ */
+void *memset(void *s, int c, size_t n);
+#define memset(s, c, n) __builtin_memset((s), (c), (n))
+
+void *memcpy(void *dest, const void *src, size_t n);
+#define memcpy(dest, src, n) __builtin_memcpy((dest), (src), (n))
+
+void *memmove(void *dest, const void *src, size_t n);
+#define memmove(dest, src, n) __builtin_memmove((dest), (src), (n))
+
+int memcmp(const void *s1, const void *s2, size_t n);
+#define memcmp(s1, s2, n) __builtin_memcmp((s1), (s2), (n))
+
+#undef LIBDEFLATE_ENABLE_ASSERTIONS
+#else
+#include <string.h>
+#endif
+
+/*
+ * Runtime assertion support. Don't enable this in production builds; it may
+ * hurt performance significantly.
+ */
+#ifdef LIBDEFLATE_ENABLE_ASSERTIONS
+void libdeflate_assertion_failed(const char *expr, const char *file, int line);
+#define ASSERT(expr) { if (unlikely(!(expr))) \
+ libdeflate_assertion_failed(#expr, __FILE__, __LINE__); }
+#else
+#define ASSERT(expr) (void)(expr)
+#endif
+
+#define CONCAT_IMPL(a, b) a##b
+#define CONCAT(a, b) CONCAT_IMPL(a, b)
+#define ADD_SUFFIX(name) CONCAT(name, SUFFIX)
+
+#endif /* LIB_LIB_COMMON_H */
diff --git a/tools/z64compress/src/enc/libdeflate/lib/matchfinder_common.h b/tools/z64compress/src/enc/libdeflate/lib/matchfinder_common.h
new file mode 100644
index 000000000..48a243e1d
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/lib/matchfinder_common.h
@@ -0,0 +1,199 @@
+/*
+ * matchfinder_common.h - common code for Lempel-Ziv matchfinding
+ */
+
+#ifndef LIB_MATCHFINDER_COMMON_H
+#define LIB_MATCHFINDER_COMMON_H
+
+#include "lib_common.h"
+
+#ifndef MATCHFINDER_WINDOW_ORDER
+# error "MATCHFINDER_WINDOW_ORDER must be defined!"
+#endif
+
+/*
+ * Given a 32-bit value that was loaded with the platform's native endianness,
+ * return a 32-bit value whose high-order 8 bits are 0 and whose low-order 24
+ * bits contain the first 3 bytes, arranged in octets in a platform-dependent
+ * order, at the memory location from which the input 32-bit value was loaded.
+ */
+static forceinline u32
+loaded_u32_to_u24(u32 v)
+{
+ if (CPU_IS_LITTLE_ENDIAN())
+ return v & 0xFFFFFF;
+ else
+ return v >> 8;
+}
+
+/*
+ * Load the next 3 bytes from @p into the 24 low-order bits of a 32-bit value.
+ * The order in which the 3 bytes will be arranged as octets in the 24 bits is
+ * platform-dependent. At least 4 bytes (not 3) must be available at @p.
+ */
+static forceinline u32
+load_u24_unaligned(const u8 *p)
+{
+#if UNALIGNED_ACCESS_IS_FAST
+ return loaded_u32_to_u24(load_u32_unaligned(p));
+#else
+ if (CPU_IS_LITTLE_ENDIAN())
+ return ((u32)p[0] << 0) | ((u32)p[1] << 8) | ((u32)p[2] << 16);
+ else
+ return ((u32)p[2] << 0) | ((u32)p[1] << 8) | ((u32)p[0] << 16);
+#endif
+}
+
+#define MATCHFINDER_WINDOW_SIZE (1UL << MATCHFINDER_WINDOW_ORDER)
+
+typedef s16 mf_pos_t;
+
+#define MATCHFINDER_INITVAL ((mf_pos_t)-MATCHFINDER_WINDOW_SIZE)
+
+/*
+ * Required alignment of the matchfinder buffer pointer and size. The values
+ * here come from the AVX-2 implementation, which is the worst case.
+ */
+#define MATCHFINDER_MEM_ALIGNMENT 32
+#define MATCHFINDER_SIZE_ALIGNMENT 128
+
+#undef matchfinder_init
+#undef matchfinder_rebase
+#ifdef _aligned_attribute
+# define MATCHFINDER_ALIGNED _aligned_attribute(MATCHFINDER_MEM_ALIGNMENT)
+# if defined(ARCH_ARM32) || defined(ARCH_ARM64)
+# include "arm/matchfinder_impl.h"
+# elif defined(ARCH_X86_32) || defined(ARCH_X86_64)
+# include "x86/matchfinder_impl.h"
+# endif
+#else
+# define MATCHFINDER_ALIGNED
+#endif
+
+/*
+ * Initialize the hash table portion of the matchfinder.
+ *
+ * Essentially, this is an optimized memset().
+ *
+ * 'data' must be aligned to a MATCHFINDER_MEM_ALIGNMENT boundary, and
+ * 'size' must be a multiple of MATCHFINDER_SIZE_ALIGNMENT.
+ */
+#ifndef matchfinder_init
+static forceinline void
+matchfinder_init(mf_pos_t *data, size_t size)
+{
+ size_t num_entries = size / sizeof(*data);
+ size_t i;
+
+ for (i = 0; i < num_entries; i++)
+ data[i] = MATCHFINDER_INITVAL;
+}
+#endif
+
+/*
+ * Slide the matchfinder by MATCHFINDER_WINDOW_SIZE bytes.
+ *
+ * This must be called just after each MATCHFINDER_WINDOW_SIZE bytes have been
+ * run through the matchfinder.
+ *
+ * This subtracts MATCHFINDER_WINDOW_SIZE bytes from each entry in the given
+ * array, making the entries be relative to the current position rather than the
+ * position MATCHFINDER_WINDOW_SIZE bytes prior. To avoid integer underflows,
+ * entries that would become less than -MATCHFINDER_WINDOW_SIZE stay at
+ * -MATCHFINDER_WINDOW_SIZE, keeping them permanently out of bounds.
+ *
+ * The given array must contain all matchfinder data that is position-relative:
+ * the hash table(s) as well as any hash chain or binary tree links. Its
+ * address must be aligned to a MATCHFINDER_MEM_ALIGNMENT boundary, and its size
+ * must be a multiple of MATCHFINDER_SIZE_ALIGNMENT.
+ */
+#ifndef matchfinder_rebase
+static forceinline void
+matchfinder_rebase(mf_pos_t *data, size_t size)
+{
+ size_t num_entries = size / sizeof(*data);
+ size_t i;
+
+ if (MATCHFINDER_WINDOW_SIZE == 32768) {
+ /*
+ * Branchless version for 32768-byte windows. Clear all bits if
+ * the value was already negative, then set the sign bit. This
+ * is equivalent to subtracting 32768 with signed saturation.
+ */
+ for (i = 0; i < num_entries; i++)
+ data[i] = 0x8000 | (data[i] & ~(data[i] >> 15));
+ } else {
+ for (i = 0; i < num_entries; i++) {
+ if (data[i] >= 0)
+ data[i] -= (mf_pos_t)-MATCHFINDER_WINDOW_SIZE;
+ else
+ data[i] = (mf_pos_t)-MATCHFINDER_WINDOW_SIZE;
+ }
+ }
+}
+#endif
+
+/*
+ * The hash function: given a sequence prefix held in the low-order bits of a
+ * 32-bit value, multiply by a carefully-chosen large constant. Discard any
+ * bits of the product that don't fit in a 32-bit value, but take the
+ * next-highest @num_bits bits of the product as the hash value, as those have
+ * the most randomness.
+ */
+static forceinline u32
+lz_hash(u32 seq, unsigned num_bits)
+{
+ return (u32)(seq * 0x1E35A7BD) >> (32 - num_bits);
+}
+
+/*
+ * Return the number of bytes at @matchptr that match the bytes at @strptr, up
+ * to a maximum of @max_len. Initially, @start_len bytes are matched.
+ */
+static forceinline unsigned
+lz_extend(const u8 * const strptr, const u8 * const matchptr,
+ const unsigned start_len, const unsigned max_len)
+{
+ unsigned len = start_len;
+ machine_word_t v_word;
+
+ if (UNALIGNED_ACCESS_IS_FAST) {
+
+ if (likely(max_len - len >= 4 * WORDBYTES)) {
+
+ #define COMPARE_WORD_STEP \
+ v_word = load_word_unaligned(&matchptr[len]) ^ \
+ load_word_unaligned(&strptr[len]); \
+ if (v_word != 0) \
+ goto word_differs; \
+ len += WORDBYTES; \
+
+ COMPARE_WORD_STEP
+ COMPARE_WORD_STEP
+ COMPARE_WORD_STEP
+ COMPARE_WORD_STEP
+ #undef COMPARE_WORD_STEP
+ }
+
+ while (len + WORDBYTES <= max_len) {
+ v_word = load_word_unaligned(&matchptr[len]) ^
+ load_word_unaligned(&strptr[len]);
+ if (v_word != 0)
+ goto word_differs;
+ len += WORDBYTES;
+ }
+ }
+
+ while (len < max_len && matchptr[len] == strptr[len])
+ len++;
+ return len;
+
+word_differs:
+ if (CPU_IS_LITTLE_ENDIAN())
+ len += (bsfw(v_word) >> 3);
+ else
+ len += (WORDBITS - 1 - bsrw(v_word)) >> 3;
+ return len;
+}
+
+#endif /* LIB_MATCHFINDER_COMMON_H */
diff --git a/tools/z64compress/src/enc/libdeflate/lib/utils.c b/tools/z64compress/src/enc/libdeflate/lib/utils.c
new file mode 100644
index 000000000..c8e5121e5
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/lib/utils.c
@@ -0,0 +1,153 @@
+/*
+ * utils.c - utility functions for libdeflate
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "lib_common.h"
+
+#include "libdeflate.h"
+
+#ifdef FREESTANDING
+# define malloc NULL
+# define free NULL
+#else
+# include <stdlib.h>
+#endif
+
+static void *(*libdeflate_malloc_func)(size_t) = malloc;
+static void (*libdeflate_free_func)(void *) = free;
+
+void *
+libdeflate_malloc(size_t size)
+{
+ return (*libdeflate_malloc_func)(size);
+}
+
+void
+libdeflate_free(void *ptr)
+{
+ (*libdeflate_free_func)(ptr);
+}
+
+void *
+libdeflate_aligned_malloc(size_t alignment, size_t size)
+{
+ void *ptr = libdeflate_malloc(sizeof(void *) + alignment - 1 + size);
+ if (ptr) {
+ void *orig_ptr = ptr;
+ ptr = (void *)ALIGN((uintptr_t)ptr + sizeof(void *), alignment);
+ ((void **)ptr)[-1] = orig_ptr;
+ }
+ return ptr;
+}
+
+void
+libdeflate_aligned_free(void *ptr)
+{
+ if (ptr)
+ libdeflate_free(((void **)ptr)[-1]);
+}
+
+LIBDEFLATEAPI void
+libdeflate_set_memory_allocator(void *(*malloc_func)(size_t),
+ void (*free_func)(void *))
+{
+ libdeflate_malloc_func = malloc_func;
+ libdeflate_free_func = free_func;
+}
+
+/*
+ * Implementations of libc functions for freestanding library builds.
+ * Normal library builds don't use these. Not optimized yet; usually the
+ * compiler expands these functions and doesn't actually call them anyway.
+ */
+#ifdef FREESTANDING
+#undef memset
+void * __attribute__((weak))
+memset(void *s, int c, size_t n)
+{
+ u8 *p = s;
+ size_t i;
+
+ for (i = 0; i < n; i++)
+ p[i] = c;
+ return s;
+}
+
+#undef memcpy
+void * __attribute__((weak))
+memcpy(void *dest, const void *src, size_t n)
+{
+ u8 *d = dest;
+ const u8 *s = src;
+ size_t i;
+
+ for (i = 0; i < n; i++)
+ d[i] = s[i];
+ return dest;
+}
+
+#undef memmove
+void * __attribute__((weak))
+memmove(void *dest, const void *src, size_t n)
+{
+ u8 *d = dest;
+ const u8 *s = src;
+ size_t i;
+
+ if (d <= s)
+ return memcpy(d, s, n);
+
+ for (i = n; i > 0; i--)
+ d[i - 1] = s[i - 1];
+ return dest;
+}
+
+#undef memcmp
+int __attribute__((weak))
+memcmp(const void *s1, const void *s2, size_t n)
+{
+ const u8 *p1 = s1;
+ const u8 *p2 = s2;
+ size_t i;
+
+ for (i = 0; i < n; i++) {
+ if (p1[i] != p2[i])
+ return (int)p1[i] - (int)p2[i];
+ }
+ return 0;
+}
+#endif /* FREESTANDING */
+
+#ifdef LIBDEFLATE_ENABLE_ASSERTIONS
+#include <stdio.h>
+#include <stdlib.h>
+void
+libdeflate_assertion_failed(const char *expr, const char *file, int line)
+{
+ fprintf(stderr, "Assertion failed: %s at %s:%d\n", expr, file, line);
+ abort();
+}
+#endif /* LIBDEFLATE_ENABLE_ASSERTIONS */
diff --git a/tools/z64compress/src/enc/libdeflate/lib/x86/adler32_impl.h b/tools/z64compress/src/enc/libdeflate/lib/x86/adler32_impl.h
new file mode 100644
index 000000000..6285dc80a
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/lib/x86/adler32_impl.h
@@ -0,0 +1,287 @@
+/*
+ * x86/adler32_impl.h - x86 implementations of Adler-32 checksum algorithm
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef LIB_X86_ADLER32_IMPL_H
+#define LIB_X86_ADLER32_IMPL_H
+
+#include "cpu_features.h"
+
+/*
+ * The following macros horizontally sum the s1 counters and add them to the
+ * real s1, and likewise for s2. They do this via a series of reductions, each
+ * of which halves the vector length, until just one counter remains.
+ *
+ * The s1 reductions don't depend on the s2 reductions and vice versa, so for
+ * efficiency they are interleaved. Also, every other s1 counter is 0 due to
+ * the 'psadbw' instruction (_mm_sad_epu8) summing groups of 8 bytes rather than
+ * 4; hence, one of the s1 reductions is skipped when going from 128 => 32 bits.
+ */
+
+#define ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2) \
+{ \
+ __m128i /* __v4su */ s1_last = (v_s1), s2_last = (v_s2); \
+ \
+ /* 128 => 32 bits */ \
+ s2_last = _mm_add_epi32(s2_last, _mm_shuffle_epi32(s2_last, 0x31)); \
+ s1_last = _mm_add_epi32(s1_last, _mm_shuffle_epi32(s1_last, 0x02)); \
+ s2_last = _mm_add_epi32(s2_last, _mm_shuffle_epi32(s2_last, 0x02)); \
+ \
+ *(s1) += (u32)_mm_cvtsi128_si32(s1_last); \
+ *(s2) += (u32)_mm_cvtsi128_si32(s2_last); \
+}
+
+#define ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2) \
+{ \
+ __m128i /* __v4su */ s1_128bit, s2_128bit; \
+ \
+ /* 256 => 128 bits */ \
+ s1_128bit = _mm_add_epi32(_mm256_extracti128_si256((v_s1), 0), \
+ _mm256_extracti128_si256((v_s1), 1)); \
+ s2_128bit = _mm_add_epi32(_mm256_extracti128_si256((v_s2), 0), \
+ _mm256_extracti128_si256((v_s2), 1)); \
+ \
+ ADLER32_FINISH_VEC_CHUNK_128((s1), (s2), s1_128bit, s2_128bit); \
+}
+
+/*
+ * This is a very silly partial workaround for gcc bug
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107892. The bug causes gcc to
+ * generate extra move instructions in some loops containing vector intrinsics.
+ *
+ * An alternate workaround would be to use gcc native vector operations instead
+ * of vector intrinsics. But that would result in MSVC needing its own code.
+ */
+#if GCC_PREREQ(1, 0)
+# define GCC_UPDATE_VARS(a, b, c, d, e, f) \
+ __asm__("" : "+x" (a), "+x" (b), "+x" (c), "+x" (d), "+x" (e), "+x" (f))
+#else
+# define GCC_UPDATE_VARS(a, b, c, d, e, f) \
+ (void)a, (void)b, (void)c, (void)d, (void)e, (void)f
+#endif
+
+/* SSE2 implementation */
+#if HAVE_SSE2_INTRIN
+# define adler32_sse2 adler32_sse2
+# define FUNCNAME adler32_sse2
+# define FUNCNAME_CHUNK adler32_sse2_chunk
+# define IMPL_ALIGNMENT 16
+# define IMPL_SEGMENT_LEN 32
+/*
+ * The 16-bit precision byte counters must not be allowed to undergo *signed*
+ * overflow, otherwise the signed multiplications at the end (_mm_madd_epi16)
+ * would behave incorrectly.
+ */
+# define IMPL_MAX_CHUNK_LEN (32 * (0x7FFF / 0xFF))
+# if HAVE_SSE2_NATIVE
+# define ATTRIBUTES
+# else
+# define ATTRIBUTES _target_attribute("sse2")
+# endif
+# include <emmintrin.h>
+static forceinline ATTRIBUTES void
+adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2)
+{
+ const __m128i zeroes = _mm_setzero_si128();
+ const __m128i /* __v8hu */ mults_a =
+ _mm_setr_epi16(32, 31, 30, 29, 28, 27, 26, 25);
+ const __m128i /* __v8hu */ mults_b =
+ _mm_setr_epi16(24, 23, 22, 21, 20, 19, 18, 17);
+ const __m128i /* __v8hu */ mults_c =
+ _mm_setr_epi16(16, 15, 14, 13, 12, 11, 10, 9);
+ const __m128i /* __v8hu */ mults_d =
+ _mm_setr_epi16(8, 7, 6, 5, 4, 3, 2, 1);
+
+ /* s1 counters: 32-bit, sum of bytes */
+ __m128i /* __v4su */ v_s1 = zeroes;
+
+ /* s2 counters: 32-bit, sum of s1 values */
+ __m128i /* __v4su */ v_s2 = zeroes;
+
+ /*
+ * Thirty-two 16-bit counters for byte sums. Each accumulates the bytes
+ * that eventually need to be multiplied by a number 32...1 for addition
+ * into s2.
+ */
+ __m128i /* __v8hu */ v_byte_sums_a = zeroes;
+ __m128i /* __v8hu */ v_byte_sums_b = zeroes;
+ __m128i /* __v8hu */ v_byte_sums_c = zeroes;
+ __m128i /* __v8hu */ v_byte_sums_d = zeroes;
+
+ do {
+ /* Load the next 32 bytes. */
+ const __m128i bytes1 = *p++;
+ const __m128i bytes2 = *p++;
+
+ /*
+ * Accumulate the previous s1 counters into the s2 counters.
+ * Logically, this really should be v_s2 += v_s1 * 32, but we
+ * can do the multiplication (or left shift) later.
+ */
+ v_s2 = _mm_add_epi32(v_s2, v_s1);
+
+ /*
+ * s1 update: use "Packed Sum of Absolute Differences" to add
+ * the bytes horizontally with 8 bytes per sum. Then add the
+ * sums to the s1 counters.
+ */
+ v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes1, zeroes));
+ v_s1 = _mm_add_epi32(v_s1, _mm_sad_epu8(bytes2, zeroes));
+
+ /*
+ * Also accumulate the bytes into 32 separate counters that have
+ * 16-bit precision.
+ */
+ v_byte_sums_a = _mm_add_epi16(
+ v_byte_sums_a, _mm_unpacklo_epi8(bytes1, zeroes));
+ v_byte_sums_b = _mm_add_epi16(
+ v_byte_sums_b, _mm_unpackhi_epi8(bytes1, zeroes));
+ v_byte_sums_c = _mm_add_epi16(
+ v_byte_sums_c, _mm_unpacklo_epi8(bytes2, zeroes));
+ v_byte_sums_d = _mm_add_epi16(
+ v_byte_sums_d, _mm_unpackhi_epi8(bytes2, zeroes));
+
+ GCC_UPDATE_VARS(v_s1, v_s2, v_byte_sums_a, v_byte_sums_b,
+ v_byte_sums_c, v_byte_sums_d);
+ } while (p != end);
+
+ /* Finish calculating the s2 counters. */
+ v_s2 = _mm_slli_epi32(v_s2, 5);
+ v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(v_byte_sums_a, mults_a));
+ v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(v_byte_sums_b, mults_b));
+ v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(v_byte_sums_c, mults_c));
+ v_s2 = _mm_add_epi32(v_s2, _mm_madd_epi16(v_byte_sums_d, mults_d));
+
+ /* Add the counters to the real s1 and s2. */
+ ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2);
+}
+# include "../adler32_vec_template.h"
+#endif /* HAVE_SSE2_INTRIN */
+
+/*
+ * AVX2 implementation. Basically the same as the SSE2 one, but with the vector
+ * width doubled.
+ */
+#if HAVE_AVX2_INTRIN
+# define adler32_avx2 adler32_avx2
+# define FUNCNAME adler32_avx2
+# define FUNCNAME_CHUNK adler32_avx2_chunk
+# define IMPL_ALIGNMENT 32
+# define IMPL_SEGMENT_LEN 64
+# define IMPL_MAX_CHUNK_LEN (64 * (0x7FFF / 0xFF))
+# if HAVE_AVX2_NATIVE
+# define ATTRIBUTES
+# else
+# define ATTRIBUTES _target_attribute("avx2")
+# endif
+# include <immintrin.h>
+ /*
+ * With clang in MSVC compatibility mode, immintrin.h incorrectly skips
+ * including some sub-headers.
+ */
+# if defined(__clang__) && defined(_MSC_VER)
+# include <avxintrin.h>
+# include <avx2intrin.h>
+# endif
+static forceinline ATTRIBUTES void
+adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2)
+{
+ const __m256i zeroes = _mm256_setzero_si256();
+ /*
+ * Note, the multipliers have to be in this order because
+ * _mm256_unpack{lo,hi}_epi8 work on each 128-bit lane separately.
+ */
+ const __m256i /* __v16hu */ mults_a =
+ _mm256_setr_epi16(64, 63, 62, 61, 60, 59, 58, 57,
+ 48, 47, 46, 45, 44, 43, 42, 41);
+ const __m256i /* __v16hu */ mults_b =
+ _mm256_setr_epi16(56, 55, 54, 53, 52, 51, 50, 49,
+ 40, 39, 38, 37, 36, 35, 34, 33);
+ const __m256i /* __v16hu */ mults_c =
+ _mm256_setr_epi16(32, 31, 30, 29, 28, 27, 26, 25,
+ 16, 15, 14, 13, 12, 11, 10, 9);
+ const __m256i /* __v16hu */ mults_d =
+ _mm256_setr_epi16(24, 23, 22, 21, 20, 19, 18, 17,
+ 8, 7, 6, 5, 4, 3, 2, 1);
+ __m256i /* __v8su */ v_s1 = zeroes;
+ __m256i /* __v8su */ v_s2 = zeroes;
+ __m256i /* __v16hu */ v_byte_sums_a = zeroes;
+ __m256i /* __v16hu */ v_byte_sums_b = zeroes;
+ __m256i /* __v16hu */ v_byte_sums_c = zeroes;
+ __m256i /* __v16hu */ v_byte_sums_d = zeroes;
+
+ do {
+ const __m256i bytes1 = *p++;
+ const __m256i bytes2 = *p++;
+
+ v_s2 = _mm256_add_epi32(v_s2, v_s1);
+ v_s1 = _mm256_add_epi32(v_s1, _mm256_sad_epu8(bytes1, zeroes));
+ v_s1 = _mm256_add_epi32(v_s1, _mm256_sad_epu8(bytes2, zeroes));
+ v_byte_sums_a = _mm256_add_epi16(
+ v_byte_sums_a, _mm256_unpacklo_epi8(bytes1, zeroes));
+ v_byte_sums_b = _mm256_add_epi16(
+ v_byte_sums_b, _mm256_unpackhi_epi8(bytes1, zeroes));
+ v_byte_sums_c = _mm256_add_epi16(
+ v_byte_sums_c, _mm256_unpacklo_epi8(bytes2, zeroes));
+ v_byte_sums_d = _mm256_add_epi16(
+ v_byte_sums_d, _mm256_unpackhi_epi8(bytes2, zeroes));
+
+ GCC_UPDATE_VARS(v_s1, v_s2, v_byte_sums_a, v_byte_sums_b,
+ v_byte_sums_c, v_byte_sums_d);
+ } while (p != end);
+
+ v_s2 = _mm256_slli_epi32(v_s2, 6);
+ v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(v_byte_sums_a, mults_a));
+ v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(v_byte_sums_b, mults_b));
+ v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(v_byte_sums_c, mults_c));
+ v_s2 = _mm256_add_epi32(v_s2, _mm256_madd_epi16(v_byte_sums_d, mults_d));
+ ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2);
+}
+# include "../adler32_vec_template.h"
+#endif /* HAVE_AVX2_INTRIN */
+
+#if defined(adler32_avx2) && HAVE_AVX2_NATIVE
+#define DEFAULT_IMPL adler32_avx2
+#else
+static inline adler32_func_t
+arch_select_adler32_func(void)
+{
+ const u32 features MAYBE_UNUSED = get_x86_cpu_features();
+
+#ifdef adler32_avx2
+ if (HAVE_AVX2(features))
+ return adler32_avx2;
+#endif
+#ifdef adler32_sse2
+ if (HAVE_SSE2(features))
+ return adler32_sse2;
+#endif
+ return NULL;
+}
+#define arch_select_adler32_func arch_select_adler32_func
+#endif
+
+#endif /* LIB_X86_ADLER32_IMPL_H */
diff --git a/tools/z64compress/src/enc/libdeflate/lib/x86/cpu_features.c b/tools/z64compress/src/enc/libdeflate/lib/x86/cpu_features.c
new file mode 100644
index 000000000..958777ebd
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/lib/x86/cpu_features.c
@@ -0,0 +1,151 @@
+/*
+ * x86/cpu_features.c - feature detection for x86 CPUs
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "../cpu_features_common.h" /* must be included first */
+#include "cpu_features.h"
+
+#if HAVE_DYNAMIC_X86_CPU_FEATURES
+
+/* With old GCC versions we have to manually save and restore the x86_32 PIC
+ * register (ebx). See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602 */
+#if defined(ARCH_X86_32) && defined(__PIC__)
+# define EBX_CONSTRAINT "=&r"
+#else
+# define EBX_CONSTRAINT "=b"
+#endif
+
+/* Execute the CPUID instruction. */
+static inline void
+cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d)
+{
+#ifdef _MSC_VER
+ int result[4];
+
+ __cpuidex(result, leaf, subleaf);
+ *a = result[0];
+ *b = result[1];
+ *c = result[2];
+ *d = result[3];
+#else
+ __asm__(".ifnc %%ebx, %1; mov %%ebx, %1; .endif\n"
+ "cpuid \n"
+ ".ifnc %%ebx, %1; xchg %%ebx, %1; .endif\n"
+ : "=a" (*a), EBX_CONSTRAINT (*b), "=c" (*c), "=d" (*d)
+ : "a" (leaf), "c" (subleaf));
+#endif
+}
+
+/* Read an extended control register. */
+static inline u64
+read_xcr(u32 index)
+{
+#ifdef _MSC_VER
+ return _xgetbv(index);
+#else
+ u32 edx, eax;
+
+ /* Execute the "xgetbv" instruction. Old versions of binutils do not
+ * recognize this instruction, so list the raw bytes instead. */
+ __asm__ (".byte 0x0f, 0x01, 0xd0" : "=d" (edx), "=a" (eax) : "c" (index));
+
+ return ((u64)edx << 32) | eax;
+#endif
+}
+
+#undef BIT
+#define BIT(nr) (1UL << (nr))
+
+#define XCR0_BIT_SSE BIT(1)
+#define XCR0_BIT_AVX BIT(2)
+
+#define IS_SET(reg, nr) ((reg) & BIT(nr))
+#define IS_ALL_SET(reg, mask) (((reg) & (mask)) == (mask))
+
+static const struct cpu_feature x86_cpu_feature_table[] = {
+ {X86_CPU_FEATURE_SSE2, "sse2"},
+ {X86_CPU_FEATURE_PCLMUL, "pclmul"},
+ {X86_CPU_FEATURE_AVX, "avx"},
+ {X86_CPU_FEATURE_AVX2, "avx2"},
+ {X86_CPU_FEATURE_BMI2, "bmi2"},
+};
+
+volatile u32 libdeflate_x86_cpu_features = 0;
+
+/* Initialize libdeflate_x86_cpu_features. */
+void libdeflate_init_x86_cpu_features(void)
+{
+ u32 features = 0;
+ u32 dummy1, dummy2, dummy3, dummy4;
+ u32 max_function;
+ u32 features_1, features_2, features_3, features_4;
+ bool os_avx_support = false;
+
+ /* Get maximum supported function */
+ cpuid(0, 0, &max_function, &dummy2, &dummy3, &dummy4);
+ if (max_function < 1)
+ goto out;
+
+ /* Standard feature flags */
+ cpuid(1, 0, &dummy1, &dummy2, &features_2, &features_1);
+
+ if (IS_SET(features_1, 26))
+ features |= X86_CPU_FEATURE_SSE2;
+
+ if (IS_SET(features_2, 1))
+ features |= X86_CPU_FEATURE_PCLMUL;
+
+ if (IS_SET(features_2, 27)) { /* OSXSAVE set? */
+ u64 xcr0 = read_xcr(0);
+
+ os_avx_support = IS_ALL_SET(xcr0,
+ XCR0_BIT_SSE |
+ XCR0_BIT_AVX);
+ }
+
+ if (os_avx_support && IS_SET(features_2, 28))
+ features |= X86_CPU_FEATURE_AVX;
+
+ if (max_function < 7)
+ goto out;
+
+ /* Extended feature flags */
+ cpuid(7, 0, &dummy1, &features_3, &features_4, &dummy4);
+
+ if (os_avx_support && IS_SET(features_3, 5))
+ features |= X86_CPU_FEATURE_AVX2;
+
+ if (IS_SET(features_3, 8))
+ features |= X86_CPU_FEATURE_BMI2;
+
+out:
+ disable_cpu_features_for_testing(&features, x86_cpu_feature_table,
+ ARRAY_LEN(x86_cpu_feature_table));
+
+ libdeflate_x86_cpu_features = features | X86_CPU_FEATURES_KNOWN;
+}
+
+#endif /* HAVE_DYNAMIC_X86_CPU_FEATURES */
diff --git a/tools/z64compress/src/enc/libdeflate/lib/x86/cpu_features.h b/tools/z64compress/src/enc/libdeflate/lib/x86/cpu_features.h
new file mode 100644
index 000000000..561bd567f
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/lib/x86/cpu_features.h
@@ -0,0 +1,155 @@
+/*
+ * x86/cpu_features.h - feature detection for x86 CPUs
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef LIB_X86_CPU_FEATURES_H
+#define LIB_X86_CPU_FEATURES_H
+
+#include "../lib_common.h"
+
+#define HAVE_DYNAMIC_X86_CPU_FEATURES 0
+
+#if defined(ARCH_X86_32) || defined(ARCH_X86_64)
+
+#if COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE || defined(_MSC_VER)
+# undef HAVE_DYNAMIC_X86_CPU_FEATURES
+# define HAVE_DYNAMIC_X86_CPU_FEATURES 1
+#endif
+
+#define X86_CPU_FEATURE_SSE2 0x00000001
+#define X86_CPU_FEATURE_PCLMUL 0x00000002
+#define X86_CPU_FEATURE_AVX 0x00000004
+#define X86_CPU_FEATURE_AVX2 0x00000008
+#define X86_CPU_FEATURE_BMI2 0x00000010
+
+#define HAVE_SSE2(features) (HAVE_SSE2_NATIVE || ((features) & X86_CPU_FEATURE_SSE2))
+#define HAVE_PCLMUL(features) (HAVE_PCLMUL_NATIVE || ((features) & X86_CPU_FEATURE_PCLMUL))
+#define HAVE_AVX(features) (HAVE_AVX_NATIVE || ((features) & X86_CPU_FEATURE_AVX))
+#define HAVE_AVX2(features) (HAVE_AVX2_NATIVE || ((features) & X86_CPU_FEATURE_AVX2))
+#define HAVE_BMI2(features) (HAVE_BMI2_NATIVE || ((features) & X86_CPU_FEATURE_BMI2))
+
+#if HAVE_DYNAMIC_X86_CPU_FEATURES
+#define X86_CPU_FEATURES_KNOWN 0x80000000
+extern volatile u32 libdeflate_x86_cpu_features;
+
+void libdeflate_init_x86_cpu_features(void);
+
+static inline u32 get_x86_cpu_features(void)
+{
+ if (libdeflate_x86_cpu_features == 0)
+ libdeflate_init_x86_cpu_features();
+ return libdeflate_x86_cpu_features;
+}
+#else /* HAVE_DYNAMIC_X86_CPU_FEATURES */
+static inline u32 get_x86_cpu_features(void) { return 0; }
+#endif /* !HAVE_DYNAMIC_X86_CPU_FEATURES */
+
+/*
+ * Prior to gcc 4.9 (r200349) and clang 3.8 (r239883), x86 intrinsics not
+ * available in the main target couldn't be used in 'target' attribute
+ * functions. Unfortunately clang has no feature test macro for this, so we
+ * have to check its version.
+ */
+#if HAVE_DYNAMIC_X86_CPU_FEATURES && \
+ (GCC_PREREQ(4, 9) || CLANG_PREREQ(3, 8, 7030000) || defined(_MSC_VER))
+# define HAVE_TARGET_INTRINSICS 1
+#else
+# define HAVE_TARGET_INTRINSICS 0
+#endif
+
+/* SSE2 */
+#if defined(__SSE2__) || \
+ (defined(_MSC_VER) && \
+ (defined(ARCH_X86_64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)))
+# define HAVE_SSE2_NATIVE 1
+#else
+# define HAVE_SSE2_NATIVE 0
+#endif
+#define HAVE_SSE2_INTRIN (HAVE_SSE2_NATIVE || HAVE_TARGET_INTRINSICS)
+
+/* PCLMUL */
+#if defined(__PCLMUL__) || (defined(_MSC_VER) && defined(__AVX2__))
+# define HAVE_PCLMUL_NATIVE 1
+#else
+# define HAVE_PCLMUL_NATIVE 0
+#endif
+#if HAVE_PCLMUL_NATIVE || (HAVE_TARGET_INTRINSICS && \
+ (GCC_PREREQ(4, 4) || \
+ __has_builtin(__builtin_ia32_pclmulqdq128) || \
+ defined(_MSC_VER)))
+# define HAVE_PCLMUL_INTRIN 1
+#else
+# define HAVE_PCLMUL_INTRIN 0
+#endif
+
+/* AVX */
+#ifdef __AVX__
+# define HAVE_AVX_NATIVE 1
+#else
+# define HAVE_AVX_NATIVE 0
+#endif
+#if HAVE_AVX_NATIVE || (HAVE_TARGET_INTRINSICS && \
+ (GCC_PREREQ(4, 6) || \
+ __has_builtin(__builtin_ia32_maxps256) || \
+ defined(_MSC_VER)))
+# define HAVE_AVX_INTRIN 1
+#else
+# define HAVE_AVX_INTRIN 0
+#endif
+
+/* AVX2 */
+#ifdef __AVX2__
+# define HAVE_AVX2_NATIVE 1
+#else
+# define HAVE_AVX2_NATIVE 0
+#endif
+#if HAVE_AVX2_NATIVE || (HAVE_TARGET_INTRINSICS && \
+ (GCC_PREREQ(4, 7) || \
+ __has_builtin(__builtin_ia32_psadbw256) || \
+ defined(_MSC_VER)))
+# define HAVE_AVX2_INTRIN 1
+#else
+# define HAVE_AVX2_INTRIN 0
+#endif
+
+/* BMI2 */
+#if defined(__BMI2__) || (defined(_MSC_VER) && defined(__AVX2__))
+# define HAVE_BMI2_NATIVE 1
+#else
+# define HAVE_BMI2_NATIVE 0
+#endif
+#if HAVE_BMI2_NATIVE || (HAVE_TARGET_INTRINSICS && \
+ (GCC_PREREQ(4, 7) || \
+ __has_builtin(__builtin_ia32_pdep_di) || \
+ defined(_MSC_VER)))
+# define HAVE_BMI2_INTRIN 1
+#else
+# define HAVE_BMI2_INTRIN 0
+#endif
+
+#endif /* ARCH_X86_32 || ARCH_X86_64 */
+
+#endif /* LIB_X86_CPU_FEATURES_H */
diff --git a/tools/z64compress/src/enc/libdeflate/lib/x86/crc32_impl.h b/tools/z64compress/src/enc/libdeflate/lib/x86/crc32_impl.h
new file mode 100644
index 000000000..79cc7944e
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/lib/x86/crc32_impl.h
@@ -0,0 +1,96 @@
+/*
+ * x86/crc32_impl.h - x86 implementations of the gzip CRC-32 algorithm
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef LIB_X86_CRC32_IMPL_H
+#define LIB_X86_CRC32_IMPL_H
+
+#include "cpu_features.h"
+
+/* PCLMUL implementation */
+#if HAVE_PCLMUL_INTRIN
+# define crc32_x86_pclmul crc32_x86_pclmul
+# define SUFFIX _pclmul
+# if HAVE_PCLMUL_NATIVE
+# define ATTRIBUTES
+# else
+# define ATTRIBUTES _target_attribute("pclmul")
+# endif
+# define FOLD_PARTIAL_VECS 0
+# include "crc32_pclmul_template.h"
+#endif
+
+/*
+ * PCLMUL/AVX implementation. This implementation has two benefits over the
+ * regular PCLMUL one. First, simply compiling against the AVX target can
+ * improve performance significantly (e.g. 10100 MB/s to 16700 MB/s on Skylake)
+ * without actually using any AVX intrinsics, probably due to the availability
+ * of non-destructive VEX-encoded instructions. Second, AVX support implies
+ * SSSE3 and SSE4.1 support, and we can use SSSE3 and SSE4.1 intrinsics for
+ * efficient handling of partial blocks. (We *could* compile a variant with
+ * PCLMUL+SSSE3+SSE4.1 w/o AVX, but for simplicity we don't currently bother.)
+ *
+ * FIXME: with MSVC, this isn't actually compiled with AVX code generation
+ * enabled yet. That would require that this be moved to its own .c file.
+ */
+#if HAVE_PCLMUL_INTRIN && HAVE_AVX_INTRIN
+# define crc32_x86_pclmul_avx crc32_x86_pclmul_avx
+# define SUFFIX _pclmul_avx
+# if HAVE_PCLMUL_NATIVE && HAVE_AVX_NATIVE
+# define ATTRIBUTES
+# else
+# define ATTRIBUTES _target_attribute("pclmul,avx")
+# endif
+# define FOLD_PARTIAL_VECS 1
+# include "crc32_pclmul_template.h"
+#endif
+
+/*
+ * If the best implementation is statically available, use it unconditionally.
+ * Otherwise choose the best implementation at runtime.
+ */
+#if defined(crc32_x86_pclmul_avx) && HAVE_PCLMUL_NATIVE && HAVE_AVX_NATIVE
+#define DEFAULT_IMPL crc32_x86_pclmul_avx
+#else
+static inline crc32_func_t
+arch_select_crc32_func(void)
+{
+ const u32 features MAYBE_UNUSED = get_x86_cpu_features();
+
+#ifdef crc32_x86_pclmul_avx
+ if (HAVE_PCLMUL(features) && HAVE_AVX(features))
+ return crc32_x86_pclmul_avx;
+#endif
+#ifdef crc32_x86_pclmul
+ if (HAVE_PCLMUL(features))
+ return crc32_x86_pclmul;
+#endif
+ return NULL;
+}
+#define arch_select_crc32_func arch_select_crc32_func
+#endif
+
+#endif /* LIB_X86_CRC32_IMPL_H */
diff --git a/tools/z64compress/src/enc/libdeflate/lib/x86/crc32_pclmul_template.h b/tools/z64compress/src/enc/libdeflate/lib/x86/crc32_pclmul_template.h
new file mode 100644
index 000000000..1d5782375
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/lib/x86/crc32_pclmul_template.h
@@ -0,0 +1,354 @@
+/*
+ * x86/crc32_pclmul_template.h - gzip CRC-32 with PCLMULQDQ instructions
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * This file is a "template" for instantiating PCLMULQDQ-based crc32_x86
+ * functions. The "parameters" are:
+ *
+ * SUFFIX:
+ * Name suffix to append to all instantiated functions.
+ * ATTRIBUTES:
+ * Target function attributes to use.
+ * FOLD_PARTIAL_VECS:
+ * Use vector instructions to handle any partial blocks at the beginning
+ * and end, instead of falling back to scalar instructions for those parts.
+ * Requires SSSE3 and SSE4.1 intrinsics.
+ *
+ * The overall algorithm used is CRC folding with carryless multiplication
+ * instructions. Note that the x86 crc32 instruction cannot be used, as it is
+ * for a different polynomial, not the gzip one. For an explanation of CRC
+ * folding with carryless multiplication instructions, see
+ * scripts/gen_crc32_multipliers.c and the following paper:
+ *
+ * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+ * https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
+ */
+
+#include <immintrin.h>
+/*
+ * With clang in MSVC compatibility mode, immintrin.h incorrectly skips
+ * including some sub-headers.
+ */
+#if defined(__clang__) && defined(_MSC_VER)
+# include <tmmintrin.h>
+# include <smmintrin.h>
+# include <wmmintrin.h>
+#endif
+
+#undef fold_vec
+static forceinline ATTRIBUTES __m128i
+ADD_SUFFIX(fold_vec)(__m128i src, __m128i dst, __m128i /* __v2di */ multipliers)
+{
+ /*
+ * The immediate constant for PCLMULQDQ specifies which 64-bit halves of
+ * the 128-bit vectors to multiply:
+ *
+ * 0x00 means low halves (higher degree polynomial terms for us)
+ * 0x11 means high halves (lower degree polynomial terms for us)
+ */
+ dst = _mm_xor_si128(dst, _mm_clmulepi64_si128(src, multipliers, 0x00));
+ dst = _mm_xor_si128(dst, _mm_clmulepi64_si128(src, multipliers, 0x11));
+ return dst;
+}
+#define fold_vec ADD_SUFFIX(fold_vec)
+
+#if FOLD_PARTIAL_VECS
+/*
+ * Given v containing a 16-byte polynomial, and a pointer 'p' that points to the
+ * next '1 <= len <= 15' data bytes, rearrange the concatenation of v and the
+ * data into vectors x0 and x1 that contain 'len' bytes and 16 bytes,
+ * respectively. Then fold x0 into x1 and return the result. Assumes that
+ * 'p + len - 16' is in-bounds.
+ */
+#undef fold_partial_vec
+static forceinline ATTRIBUTES __m128i
+ADD_SUFFIX(fold_partial_vec)(__m128i v, const u8 *p, size_t len,
+ __m128i /* __v2du */ multipliers_1)
+{
+ /*
+ * pshufb(v, shift_tab[len..len+15]) left shifts v by 16-len bytes.
+ * pshufb(v, shift_tab[len+16..len+31]) right shifts v by len bytes.
+ */
+ static const u8 shift_tab[48] = {
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ };
+ __m128i lshift = _mm_loadu_si128((const void *)&shift_tab[len]);
+ __m128i rshift = _mm_loadu_si128((const void *)&shift_tab[len + 16]);
+ __m128i x0, x1;
+
+ /* x0 = v left-shifted by '16 - len' bytes */
+ x0 = _mm_shuffle_epi8(v, lshift);
+
+ /*
+ * x1 = the last '16 - len' bytes from v (i.e. v right-shifted by 'len'
+ * bytes) followed by the remaining data.
+ */
+ x1 = _mm_blendv_epi8(_mm_shuffle_epi8(v, rshift),
+ _mm_loadu_si128((const void *)(p + len - 16)),
+ /* msb 0/1 of each byte selects byte from arg1/2 */
+ rshift);
+
+ return fold_vec(x0, x1, multipliers_1);
+}
+#define fold_partial_vec ADD_SUFFIX(fold_partial_vec)
+#endif /* FOLD_PARTIAL_VECS */
+
+static u32 ATTRIBUTES MAYBE_UNUSED
+ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
+{
+ const __m128i /* __v2du */ multipliers_8 =
+ _mm_set_epi64x(CRC32_8VECS_MULT_2, CRC32_8VECS_MULT_1);
+ const __m128i /* __v2du */ multipliers_4 =
+ _mm_set_epi64x(CRC32_4VECS_MULT_2, CRC32_4VECS_MULT_1);
+ const __m128i /* __v2du */ multipliers_2 =
+ _mm_set_epi64x(CRC32_2VECS_MULT_2, CRC32_2VECS_MULT_1);
+ const __m128i /* __v2du */ multipliers_1 =
+ _mm_set_epi64x(CRC32_1VECS_MULT_2, CRC32_1VECS_MULT_1);
+ const __m128i /* __v2du */ final_multiplier =
+ _mm_set_epi64x(0, CRC32_FINAL_MULT);
+ const __m128i mask32 = _mm_set_epi32(0, 0, 0, 0xFFFFFFFF);
+ const __m128i /* __v2du */ barrett_reduction_constants =
+ _mm_set_epi64x(CRC32_BARRETT_CONSTANT_2,
+ CRC32_BARRETT_CONSTANT_1);
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+
+ /*
+ * There are two overall code paths. The first path supports all
+ * lengths, but is intended for short lengths; it uses unaligned loads
+ * and does at most 4-way folds. The second path only supports longer
+ * lengths, aligns the pointer in order to do aligned loads, and does up
+ * to 8-way folds. The length check below decides which path to take.
+ */
+ if (len < 1024) {
+ if (len < 16)
+ return crc32_slice1(crc, p, len);
+
+ v0 = _mm_xor_si128(_mm_loadu_si128((const void *)p),
+ _mm_cvtsi32_si128(crc));
+ p += 16;
+
+ if (len >= 64) {
+ v1 = _mm_loadu_si128((const void *)(p + 0));
+ v2 = _mm_loadu_si128((const void *)(p + 16));
+ v3 = _mm_loadu_si128((const void *)(p + 32));
+ p += 48;
+ while (len >= 64 + 64) {
+ v0 = fold_vec(v0, _mm_loadu_si128((const void *)(p + 0)),
+ multipliers_4);
+ v1 = fold_vec(v1, _mm_loadu_si128((const void *)(p + 16)),
+ multipliers_4);
+ v2 = fold_vec(v2, _mm_loadu_si128((const void *)(p + 32)),
+ multipliers_4);
+ v3 = fold_vec(v3, _mm_loadu_si128((const void *)(p + 48)),
+ multipliers_4);
+ p += 64;
+ len -= 64;
+ }
+ v0 = fold_vec(v0, v2, multipliers_2);
+ v1 = fold_vec(v1, v3, multipliers_2);
+ if (len & 32) {
+ v0 = fold_vec(v0, _mm_loadu_si128((const void *)(p + 0)),
+ multipliers_2);
+ v1 = fold_vec(v1, _mm_loadu_si128((const void *)(p + 16)),
+ multipliers_2);
+ p += 32;
+ }
+ v0 = fold_vec(v0, v1, multipliers_1);
+ if (len & 16) {
+ v0 = fold_vec(v0, _mm_loadu_si128((const void *)p),
+ multipliers_1);
+ p += 16;
+ }
+ } else {
+ if (len >= 32) {
+ v0 = fold_vec(v0, _mm_loadu_si128((const void *)p),
+ multipliers_1);
+ p += 16;
+ if (len >= 48) {
+ v0 = fold_vec(v0, _mm_loadu_si128((const void *)p),
+ multipliers_1);
+ p += 16;
+ }
+ }
+ }
+ } else {
+ const size_t align = -(uintptr_t)p & 15;
+ const __m128i *vp;
+
+ #if FOLD_PARTIAL_VECS
+ v0 = _mm_xor_si128(_mm_loadu_si128((const void *)p),
+ _mm_cvtsi32_si128(crc));
+ p += 16;
+ /* Align p to the next 16-byte boundary. */
+ if (align) {
+ v0 = fold_partial_vec(v0, p, align, multipliers_1);
+ p += align;
+ len -= align;
+ }
+ vp = (const __m128i *)p;
+ #else
+ /* Align p to the next 16-byte boundary. */
+ if (align) {
+ crc = crc32_slice1(crc, p, align);
+ p += align;
+ len -= align;
+ }
+ vp = (const __m128i *)p;
+ v0 = _mm_xor_si128(*vp++, _mm_cvtsi32_si128(crc));
+ #endif
+ v1 = *vp++;
+ v2 = *vp++;
+ v3 = *vp++;
+ v4 = *vp++;
+ v5 = *vp++;
+ v6 = *vp++;
+ v7 = *vp++;
+ do {
+ v0 = fold_vec(v0, *vp++, multipliers_8);
+ v1 = fold_vec(v1, *vp++, multipliers_8);
+ v2 = fold_vec(v2, *vp++, multipliers_8);
+ v3 = fold_vec(v3, *vp++, multipliers_8);
+ v4 = fold_vec(v4, *vp++, multipliers_8);
+ v5 = fold_vec(v5, *vp++, multipliers_8);
+ v6 = fold_vec(v6, *vp++, multipliers_8);
+ v7 = fold_vec(v7, *vp++, multipliers_8);
+ len -= 128;
+ } while (len >= 128 + 128);
+
+ v0 = fold_vec(v0, v4, multipliers_4);
+ v1 = fold_vec(v1, v5, multipliers_4);
+ v2 = fold_vec(v2, v6, multipliers_4);
+ v3 = fold_vec(v3, v7, multipliers_4);
+ if (len & 64) {
+ v0 = fold_vec(v0, *vp++, multipliers_4);
+ v1 = fold_vec(v1, *vp++, multipliers_4);
+ v2 = fold_vec(v2, *vp++, multipliers_4);
+ v3 = fold_vec(v3, *vp++, multipliers_4);
+ }
+
+ v0 = fold_vec(v0, v2, multipliers_2);
+ v1 = fold_vec(v1, v3, multipliers_2);
+ if (len & 32) {
+ v0 = fold_vec(v0, *vp++, multipliers_2);
+ v1 = fold_vec(v1, *vp++, multipliers_2);
+ }
+
+ v0 = fold_vec(v0, v1, multipliers_1);
+ if (len & 16)
+ v0 = fold_vec(v0, *vp++, multipliers_1);
+
+ p = (const u8 *)vp;
+ }
+ len &= 15;
+
+ /*
+ * If fold_partial_vec() is available, handle any remaining partial
+ * block now before reducing to 32 bits.
+ */
+#if FOLD_PARTIAL_VECS
+ if (len)
+ v0 = fold_partial_vec(v0, p, len, multipliers_1);
+#endif
+
+ /*
+ * Fold 128 => 96 bits. This also implicitly appends 32 zero bits,
+ * which is equivalent to multiplying by x^32. This is needed because
+ * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
+ */
+ v0 = _mm_xor_si128(_mm_srli_si128(v0, 8),
+ _mm_clmulepi64_si128(v0, multipliers_1, 0x10));
+
+ /* Fold 96 => 64 bits. */
+ v0 = _mm_xor_si128(_mm_srli_si128(v0, 4),
+ _mm_clmulepi64_si128(_mm_and_si128(v0, mask32),
+ final_multiplier, 0x00));
+
+ /*
+ * Reduce 64 => 32 bits using Barrett reduction.
+ *
+ * Let M(x) = A(x)*x^32 + B(x) be the remaining message. The goal is to
+ * compute R(x) = M(x) mod G(x). Since degree(B(x)) < degree(G(x)):
+ *
+ * R(x) = (A(x)*x^32 + B(x)) mod G(x)
+ * = (A(x)*x^32) mod G(x) + B(x)
+ *
+ * Then, by the Division Algorithm there exists a unique q(x) such that:
+ *
+ * A(x)*x^32 mod G(x) = A(x)*x^32 - q(x)*G(x)
+ *
+ * Since the left-hand side is of maximum degree 31, the right-hand side
+ * must be too. This implies that we can apply 'mod x^32' to the
+ * right-hand side without changing its value:
+ *
+ * (A(x)*x^32 - q(x)*G(x)) mod x^32 = q(x)*G(x) mod x^32
+ *
+ * Note that '+' is equivalent to '-' in polynomials over GF(2).
+ *
+ * We also know that:
+ *
+ * / A(x)*x^32 \
+ * q(x) = floor ( --------- )
+ * \ G(x) /
+ *
+ * To compute this efficiently, we can multiply the top and bottom by
+ * x^32 and move the division by G(x) to the top:
+ *
+ * / A(x) * floor(x^64 / G(x)) \
+ * q(x) = floor ( ------------------------- )
+ * \ x^32 /
+ *
+ * Note that floor(x^64 / G(x)) is a constant.
+ *
+ * So finally we have:
+ *
+ * / A(x) * floor(x^64 / G(x)) \
+ * R(x) = B(x) + G(x)*floor ( ------------------------- )
+ * \ x^32 /
+ */
+ v1 = _mm_clmulepi64_si128(_mm_and_si128(v0, mask32),
+ barrett_reduction_constants, 0x00);
+ v1 = _mm_clmulepi64_si128(_mm_and_si128(v1, mask32),
+ barrett_reduction_constants, 0x10);
+ v0 = _mm_xor_si128(v0, v1);
+#if FOLD_PARTIAL_VECS
+ crc = _mm_extract_epi32(v0, 1);
+#else
+ crc = _mm_cvtsi128_si32(_mm_shuffle_epi32(v0, 0x01));
+ /* Process up to 15 bytes left over at the end. */
+ crc = crc32_slice1(crc, p, len);
+#endif
+ return crc;
+}
+
+#undef SUFFIX
+#undef ATTRIBUTES
+#undef FOLD_PARTIAL_VECS
diff --git a/tools/z64compress/src/enc/libdeflate/lib/x86/decompress_impl.h b/tools/z64compress/src/enc/libdeflate/lib/x86/decompress_impl.h
new file mode 100644
index 000000000..3e2ec37e7
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/lib/x86/decompress_impl.h
@@ -0,0 +1,54 @@
+#ifndef LIB_X86_DECOMPRESS_IMPL_H
+#define LIB_X86_DECOMPRESS_IMPL_H
+
+#include "cpu_features.h"
+
+/*
+ * BMI2 optimized version
+ *
+ * FIXME: with MSVC, this isn't actually compiled with BMI2 code generation
+ * enabled yet. That would require that this be moved to its own .c file.
+ */
+#if HAVE_BMI2_INTRIN
+# define deflate_decompress_bmi2 deflate_decompress_bmi2
+# define FUNCNAME deflate_decompress_bmi2
+# if !HAVE_BMI2_NATIVE
+# define ATTRIBUTES _target_attribute("bmi2")
+# endif
+ /*
+ * Even with __attribute__((target("bmi2"))), gcc doesn't reliably use the
+ * bzhi instruction for 'word & BITMASK(count)'. So use the bzhi intrinsic
+ * explicitly. EXTRACT_VARBITS() is equivalent to 'word & BITMASK(count)';
+ * EXTRACT_VARBITS8() is equivalent to 'word & BITMASK((u8)count)'.
+ * Nevertheless, their implementation using the bzhi intrinsic is identical,
+ * as the bzhi instruction truncates the count to 8 bits implicitly.
+ */
+# ifndef __clang__
+# include <immintrin.h>
+# ifdef ARCH_X86_64
+# define EXTRACT_VARBITS(word, count) _bzhi_u64((word), (count))
+# define EXTRACT_VARBITS8(word, count) _bzhi_u64((word), (count))
+# else
+# define EXTRACT_VARBITS(word, count) _bzhi_u32((word), (count))
+# define EXTRACT_VARBITS8(word, count) _bzhi_u32((word), (count))
+# endif
+# endif
+# include "../decompress_template.h"
+#endif /* HAVE_BMI2_INTRIN */
+
+#if defined(deflate_decompress_bmi2) && HAVE_BMI2_NATIVE
+#define DEFAULT_IMPL deflate_decompress_bmi2
+#else
+static inline decompress_func_t
+arch_select_decompress_func(void)
+{
+#ifdef deflate_decompress_bmi2
+ if (HAVE_BMI2(get_x86_cpu_features()))
+ return deflate_decompress_bmi2;
+#endif
+ return NULL;
+}
+#define arch_select_decompress_func arch_select_decompress_func
+#endif
+
+#endif /* LIB_X86_DECOMPRESS_IMPL_H */
diff --git a/tools/z64compress/src/enc/libdeflate/lib/x86/matchfinder_impl.h b/tools/z64compress/src/enc/libdeflate/lib/x86/matchfinder_impl.h
new file mode 100644
index 000000000..8433b9b10
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/lib/x86/matchfinder_impl.h
@@ -0,0 +1,124 @@
+/*
+ * x86/matchfinder_impl.h - x86 implementations of matchfinder functions
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef LIB_X86_MATCHFINDER_IMPL_H
+#define LIB_X86_MATCHFINDER_IMPL_H
+
+#include "cpu_features.h"
+
+#if HAVE_AVX2_NATIVE
+# include <immintrin.h>
+static forceinline void
+matchfinder_init_avx2(mf_pos_t *data, size_t size)
+{
+ __m256i *p = (__m256i *)data;
+ __m256i v = _mm256_set1_epi16(MATCHFINDER_INITVAL);
+
+ STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0);
+ STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0);
+ STATIC_ASSERT(sizeof(mf_pos_t) == 2);
+
+ do {
+ p[0] = v;
+ p[1] = v;
+ p[2] = v;
+ p[3] = v;
+ p += 4;
+ size -= 4 * sizeof(*p);
+ } while (size != 0);
+}
+#define matchfinder_init matchfinder_init_avx2
+
+static forceinline void
+matchfinder_rebase_avx2(mf_pos_t *data, size_t size)
+{
+ __m256i *p = (__m256i *)data;
+ __m256i v = _mm256_set1_epi16((u16)-MATCHFINDER_WINDOW_SIZE);
+
+ STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0);
+ STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0);
+ STATIC_ASSERT(sizeof(mf_pos_t) == 2);
+
+ do {
+ /* PADDSW: Add Packed Signed Integers With Signed Saturation */
+ p[0] = _mm256_adds_epi16(p[0], v);
+ p[1] = _mm256_adds_epi16(p[1], v);
+ p[2] = _mm256_adds_epi16(p[2], v);
+ p[3] = _mm256_adds_epi16(p[3], v);
+ p += 4;
+ size -= 4 * sizeof(*p);
+ } while (size != 0);
+}
+#define matchfinder_rebase matchfinder_rebase_avx2
+
+#elif HAVE_SSE2_NATIVE
+# include <emmintrin.h>
+static forceinline void
+matchfinder_init_sse2(mf_pos_t *data, size_t size)
+{
+ __m128i *p = (__m128i *)data;
+ __m128i v = _mm_set1_epi16(MATCHFINDER_INITVAL);
+
+ STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0);
+ STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0);
+ STATIC_ASSERT(sizeof(mf_pos_t) == 2);
+
+ do {
+ p[0] = v;
+ p[1] = v;
+ p[2] = v;
+ p[3] = v;
+ p += 4;
+ size -= 4 * sizeof(*p);
+ } while (size != 0);
+}
+#define matchfinder_init matchfinder_init_sse2
+
+static forceinline void
+matchfinder_rebase_sse2(mf_pos_t *data, size_t size)
+{
+ __m128i *p = (__m128i *)data;
+ __m128i v = _mm_set1_epi16((u16)-MATCHFINDER_WINDOW_SIZE);
+
+ STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0);
+ STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0);
+ STATIC_ASSERT(sizeof(mf_pos_t) == 2);
+
+ do {
+ /* PADDSW: Add Packed Signed Integers With Signed Saturation */
+ p[0] = _mm_adds_epi16(p[0], v);
+ p[1] = _mm_adds_epi16(p[1], v);
+ p[2] = _mm_adds_epi16(p[2], v);
+ p[3] = _mm_adds_epi16(p[3], v);
+ p += 4;
+ size -= 4 * sizeof(*p);
+ } while (size != 0);
+}
+#define matchfinder_rebase matchfinder_rebase_sse2
+#endif /* HAVE_SSE2_NATIVE */
+
+#endif /* LIB_X86_MATCHFINDER_IMPL_H */
diff --git a/tools/z64compress/src/enc/libdeflate/lib/zlib_compress.c b/tools/z64compress/src/enc/libdeflate/lib/zlib_compress.c
new file mode 100644
index 000000000..4f9cc6f08
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/lib/zlib_compress.c
@@ -0,0 +1,84 @@
+/*
+ * zlib_compress.c - compress with a zlib wrapper
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "deflate_compress.h"
+#include "zlib_constants.h"
+
+#include "libdeflate.h"
+
+LIBDEFLATEAPI size_t
+libdeflate_zlib_compress(struct libdeflate_compressor *c,
+ const void *in, size_t in_nbytes,
+ void *out, size_t out_nbytes_avail)
+{
+ u8 *out_next = out;
+ u16 hdr;
+ unsigned compression_level;
+ unsigned level_hint;
+ size_t deflate_size;
+
+ if (out_nbytes_avail <= ZLIB_MIN_OVERHEAD)
+ return 0;
+
+ /* 2 byte header: CMF and FLG */
+ hdr = (ZLIB_CM_DEFLATE << 8) | (ZLIB_CINFO_32K_WINDOW << 12);
+ compression_level = libdeflate_get_compression_level(c);
+ if (compression_level < 2)
+ level_hint = ZLIB_FASTEST_COMPRESSION;
+ else if (compression_level < 6)
+ level_hint = ZLIB_FAST_COMPRESSION;
+ else if (compression_level < 8)
+ level_hint = ZLIB_DEFAULT_COMPRESSION;
+ else
+ level_hint = ZLIB_SLOWEST_COMPRESSION;
+ hdr |= level_hint << 6;
+ hdr |= 31 - (hdr % 31);
+
+ put_unaligned_be16(hdr, out_next);
+ out_next += 2;
+
+ /* Compressed data */
+ deflate_size = libdeflate_deflate_compress(c, in, in_nbytes, out_next,
+ out_nbytes_avail - ZLIB_MIN_OVERHEAD);
+ if (deflate_size == 0)
+ return 0;
+ out_next += deflate_size;
+
+ /* ADLER32 */
+ put_unaligned_be32(libdeflate_adler32(1, in, in_nbytes), out_next);
+ out_next += 4;
+
+ return out_next - (u8 *)out;
+}
+
+LIBDEFLATEAPI size_t
+libdeflate_zlib_compress_bound(struct libdeflate_compressor *c,
+ size_t in_nbytes)
+{
+ return ZLIB_MIN_OVERHEAD +
+ libdeflate_deflate_compress_bound(c, in_nbytes);
+}
diff --git a/tools/z64compress/src/enc/libdeflate/lib/zlib_constants.h b/tools/z64compress/src/enc/libdeflate/lib/zlib_constants.h
new file mode 100644
index 000000000..f304310c7
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/lib/zlib_constants.h
@@ -0,0 +1,21 @@
+/*
+ * zlib_constants.h - constants for the zlib wrapper format
+ */
+
+#ifndef LIB_ZLIB_CONSTANTS_H
+#define LIB_ZLIB_CONSTANTS_H
+
+#define ZLIB_MIN_HEADER_SIZE 2
+#define ZLIB_FOOTER_SIZE 4
+#define ZLIB_MIN_OVERHEAD (ZLIB_MIN_HEADER_SIZE + ZLIB_FOOTER_SIZE)
+
+#define ZLIB_CM_DEFLATE 8
+
+#define ZLIB_CINFO_32K_WINDOW 7
+
+#define ZLIB_FASTEST_COMPRESSION 0
+#define ZLIB_FAST_COMPRESSION 1
+#define ZLIB_DEFAULT_COMPRESSION 2
+#define ZLIB_SLOWEST_COMPRESSION 3
+
+#endif /* LIB_ZLIB_CONSTANTS_H */
diff --git a/tools/z64compress/src/enc/libdeflate/lib/zlib_decompress.c b/tools/z64compress/src/enc/libdeflate/lib/zlib_decompress.c
new file mode 100644
index 000000000..b7b3b1f95
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/lib/zlib_decompress.c
@@ -0,0 +1,106 @@
+/*
+ * zlib_decompress.c - decompress with a zlib wrapper
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "lib_common.h"
+#include "zlib_constants.h"
+
+#include "libdeflate.h"
+
+LIBDEFLATEAPI enum libdeflate_result
+libdeflate_zlib_decompress_ex(struct libdeflate_decompressor *d,
+ const void *in, size_t in_nbytes,
+ void *out, size_t out_nbytes_avail,
+ size_t *actual_in_nbytes_ret,
+ size_t *actual_out_nbytes_ret)
+{
+ const u8 *in_next = in;
+ const u8 * const in_end = in_next + in_nbytes;
+ u16 hdr;
+ size_t actual_in_nbytes;
+ size_t actual_out_nbytes;
+ enum libdeflate_result result;
+
+ if (in_nbytes < ZLIB_MIN_OVERHEAD)
+ return LIBDEFLATE_BAD_DATA;
+
+ /* 2 byte header: CMF and FLG */
+ hdr = get_unaligned_be16(in_next);
+ in_next += 2;
+
+ /* FCHECK */
+ if ((hdr % 31) != 0)
+ return LIBDEFLATE_BAD_DATA;
+
+ /* CM */
+ if (((hdr >> 8) & 0xF) != ZLIB_CM_DEFLATE)
+ return LIBDEFLATE_BAD_DATA;
+
+ /* CINFO */
+ if ((hdr >> 12) > ZLIB_CINFO_32K_WINDOW)
+ return LIBDEFLATE_BAD_DATA;
+
+ /* FDICT */
+ if ((hdr >> 5) & 1)
+ return LIBDEFLATE_BAD_DATA;
+
+ /* Compressed data */
+ result = libdeflate_deflate_decompress_ex(d, in_next,
+ in_end - ZLIB_FOOTER_SIZE - in_next,
+ out, out_nbytes_avail,
+ &actual_in_nbytes, actual_out_nbytes_ret);
+ if (result != LIBDEFLATE_SUCCESS)
+ return result;
+
+ if (actual_out_nbytes_ret)
+ actual_out_nbytes = *actual_out_nbytes_ret;
+ else
+ actual_out_nbytes = out_nbytes_avail;
+
+ in_next += actual_in_nbytes;
+
+ /* ADLER32 */
+ if (libdeflate_adler32(1, out, actual_out_nbytes) !=
+ get_unaligned_be32(in_next))
+ return LIBDEFLATE_BAD_DATA;
+ in_next += 4;
+
+ if (actual_in_nbytes_ret)
+ *actual_in_nbytes_ret = in_next - (u8 *)in;
+
+ return LIBDEFLATE_SUCCESS;
+}
+
+LIBDEFLATEAPI enum libdeflate_result
+libdeflate_zlib_decompress(struct libdeflate_decompressor *d,
+ const void *in, size_t in_nbytes,
+ void *out, size_t out_nbytes_avail,
+ size_t *actual_out_nbytes_ret)
+{
+ return libdeflate_zlib_decompress_ex(d, in, in_nbytes,
+ out, out_nbytes_avail,
+ NULL, actual_out_nbytes_ret);
+}
diff --git a/tools/z64compress/src/enc/libdeflate/libdeflate-config.cmake.in b/tools/z64compress/src/enc/libdeflate/libdeflate-config.cmake.in
new file mode 100644
index 000000000..747799df9
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/libdeflate-config.cmake.in
@@ -0,0 +1,3 @@
+@PACKAGE_INIT@
+
+include("${CMAKE_CURRENT_LIST_DIR}/libdeflate-targets.cmake")
diff --git a/tools/z64compress/src/enc/libdeflate/libdeflate.h b/tools/z64compress/src/enc/libdeflate/libdeflate.h
new file mode 100644
index 000000000..f26087597
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/libdeflate.h
@@ -0,0 +1,368 @@
+/*
+ * libdeflate.h - public header for libdeflate
+ */
+
+#ifndef LIBDEFLATE_H
+#define LIBDEFLATE_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define LIBDEFLATE_VERSION_MAJOR 1
+#define LIBDEFLATE_VERSION_MINOR 15
+#define LIBDEFLATE_VERSION_STRING "1.15"
+
+/*
+ * Users of libdeflate.dll on Windows can define LIBDEFLATE_DLL to cause
+ * __declspec(dllimport) to be used. This should be done when it's easy to do.
+ * Otherwise it's fine to skip it, since it is a very minor performance
+ * optimization that is irrelevant for most use cases of libdeflate.
+ */
+#ifndef LIBDEFLATEAPI
+# if defined(LIBDEFLATE_DLL) && (defined(_WIN32) || defined(__CYGWIN__))
+# define LIBDEFLATEAPI __declspec(dllimport)
+# else
+# define LIBDEFLATEAPI
+# endif
+#endif
+
+/* ========================================================================== */
+/* Compression */
+/* ========================================================================== */
+
+struct libdeflate_compressor;
+
+/*
+ * libdeflate_alloc_compressor() allocates a new compressor that supports
+ * DEFLATE, zlib, and gzip compression. 'compression_level' is the compression
+ * level on a zlib-like scale but with a higher maximum value (1 = fastest, 6 =
+ * medium/default, 9 = slow, 12 = slowest). Level 0 is also supported and means
+ * "no compression", specifically "create a valid stream, but only emit
+ * uncompressed blocks" (this will expand the data slightly).
+ *
+ * The return value is a pointer to the new compressor, or NULL if out of memory
+ * or if the compression level is invalid (i.e. outside the range [0, 12]).
+ *
+ * Note: for compression, the sliding window size is defined at compilation time
+ * to 32768, the largest size permissible in the DEFLATE format. It cannot be
+ * changed at runtime.
+ *
+ * A single compressor is not safe to use by multiple threads concurrently.
+ * However, different threads may use different compressors concurrently.
+ */
+LIBDEFLATEAPI struct libdeflate_compressor *
+libdeflate_alloc_compressor(int compression_level);
+
+/*
+ * libdeflate_deflate_compress() performs raw DEFLATE compression on a buffer of
+ * data. It attempts to compress 'in_nbytes' bytes of data located at 'in' and
+ * write the result to 'out', which has space for 'out_nbytes_avail' bytes. The
+ * return value is the compressed size in bytes, or 0 if the data could not be
+ * compressed to 'out_nbytes_avail' bytes or fewer (but see note below).
+ *
+ * If compression is successful, then the output data is guaranteed to be a
+ * valid DEFLATE stream that decompresses to the input data. No other
+ * guarantees are made about the output data. Notably, different versions of
+ * libdeflate can produce different compressed data for the same uncompressed
+ * data, even at the same compression level. Do ***NOT*** do things like
+ * writing tests that compare compressed data to a golden output, as this can
+ * break when libdeflate is updated. (This property isn't specific to
+ * libdeflate; the same is true for zlib and other compression libraries too.)
+ *
+ * Note: due to a performance optimization, libdeflate_deflate_compress()
+ * currently needs a small amount of slack space at the end of the output
+ * buffer. As a result, it can't actually report compressed sizes very close to
+ * 'out_nbytes_avail'. This doesn't matter in real-world use cases, and
+ * libdeflate_deflate_compress_bound() already includes the slack space.
+ * However, it does mean that testing code that redundantly compresses data
+ * using an exact-sized output buffer won't work as might be expected:
+ *
+ * out_nbytes = libdeflate_deflate_compress(c, in, in_nbytes, out,
+ * libdeflate_deflate_compress_bound(in_nbytes));
+ * // The following assertion will fail.
+ * assert(libdeflate_deflate_compress(c, in, in_nbytes, out, out_nbytes) != 0);
+ *
+ * To avoid this, either don't write tests like the above, or make sure to
+ * include at least 9 bytes of slack space in 'out_nbytes_avail'.
+ */
+LIBDEFLATEAPI size_t
+libdeflate_deflate_compress(struct libdeflate_compressor *compressor,
+ const void *in, size_t in_nbytes,
+ void *out, size_t out_nbytes_avail);
+
+/*
+ * libdeflate_deflate_compress_bound() returns a worst-case upper bound on the
+ * number of bytes of compressed data that may be produced by compressing any
+ * buffer of length less than or equal to 'in_nbytes' using
+ * libdeflate_deflate_compress() with the specified compressor. This bound will
+ * necessarily be a number greater than or equal to 'in_nbytes'. It may be an
+ * overestimate of the true upper bound. The return value is guaranteed to be
+ * the same for all invocations with the same compressor and same 'in_nbytes'.
+ *
+ * As a special case, 'compressor' may be NULL. This causes the bound to be
+ * taken across *any* libdeflate_compressor that could ever be allocated with
+ * this build of the library, with any options.
+ *
+ * Note that this function is not necessary in many applications. With
+ * block-based compression, it is usually preferable to separately store the
+ * uncompressed size of each block and to store any blocks that did not compress
+ * to less than their original size uncompressed. In that scenario, there is no
+ * need to know the worst-case compressed size, since the maximum number of
+ * bytes of compressed data that may be used would always be one less than the
+ * input length. You can just pass a buffer of that size to
+ * libdeflate_deflate_compress() and store the data uncompressed if
+ * libdeflate_deflate_compress() returns 0, indicating that the compressed data
+ * did not fit into the provided output buffer.
+ */
+LIBDEFLATEAPI size_t
+libdeflate_deflate_compress_bound(struct libdeflate_compressor *compressor,
+ size_t in_nbytes);
+
+/*
+ * Like libdeflate_deflate_compress(), but uses the zlib wrapper format instead
+ * of raw DEFLATE.
+ */
+LIBDEFLATEAPI size_t
+libdeflate_zlib_compress(struct libdeflate_compressor *compressor,
+ const void *in, size_t in_nbytes,
+ void *out, size_t out_nbytes_avail);
+
+/*
+ * Like libdeflate_deflate_compress_bound(), but assumes the data will be
+ * compressed with libdeflate_zlib_compress() rather than with
+ * libdeflate_deflate_compress().
+ */
+LIBDEFLATEAPI size_t
+libdeflate_zlib_compress_bound(struct libdeflate_compressor *compressor,
+ size_t in_nbytes);
+
+/*
+ * Like libdeflate_deflate_compress(), but uses the gzip wrapper format instead
+ * of raw DEFLATE.
+ */
+LIBDEFLATEAPI size_t
+libdeflate_gzip_compress(struct libdeflate_compressor *compressor,
+ const void *in, size_t in_nbytes,
+ void *out, size_t out_nbytes_avail);
+
+/*
+ * Like libdeflate_deflate_compress_bound(), but assumes the data will be
+ * compressed with libdeflate_gzip_compress() rather than with
+ * libdeflate_deflate_compress().
+ */
+LIBDEFLATEAPI size_t
+libdeflate_gzip_compress_bound(struct libdeflate_compressor *compressor,
+ size_t in_nbytes);
+
+/*
+ * libdeflate_free_compressor() frees a compressor that was allocated with
+ * libdeflate_alloc_compressor(). If a NULL pointer is passed in, no action is
+ * taken.
+ */
+LIBDEFLATEAPI void
+libdeflate_free_compressor(struct libdeflate_compressor *compressor);
+
+/* ========================================================================== */
+/* Decompression */
+/* ========================================================================== */
+
+struct libdeflate_decompressor;
+
+/*
+ * libdeflate_alloc_decompressor() allocates a new decompressor that can be used
+ * for DEFLATE, zlib, and gzip decompression. The return value is a pointer to
+ * the new decompressor, or NULL if out of memory.
+ *
+ * This function takes no parameters, and the returned decompressor is valid for
+ * decompressing data that was compressed at any compression level and with any
+ * sliding window size.
+ *
+ * A single decompressor is not safe to use by multiple threads concurrently.
+ * However, different threads may use different decompressors concurrently.
+ */
+LIBDEFLATEAPI struct libdeflate_decompressor *
+libdeflate_alloc_decompressor(void);
+
+/*
+ * Result of a call to libdeflate_deflate_decompress(),
+ * libdeflate_zlib_decompress(), or libdeflate_gzip_decompress().
+ */
+enum libdeflate_result {
+ /* Decompression was successful. */
+ LIBDEFLATE_SUCCESS = 0,
+
+ /* Decompression failed because the compressed data was invalid,
+ * corrupt, or otherwise unsupported. */
+ LIBDEFLATE_BAD_DATA = 1,
+
+ /* A NULL 'actual_out_nbytes_ret' was provided, but the data would have
+ * decompressed to fewer than 'out_nbytes_avail' bytes. */
+ LIBDEFLATE_SHORT_OUTPUT = 2,
+
+ /* The data would have decompressed to more than 'out_nbytes_avail'
+ * bytes. */
+ LIBDEFLATE_INSUFFICIENT_SPACE = 3,
+};
+
+/*
+ * libdeflate_deflate_decompress() decompresses a DEFLATE stream from the buffer
+ * 'in' with compressed size up to 'in_nbytes' bytes. The uncompressed data is
+ * written to 'out', a buffer with size 'out_nbytes_avail' bytes. If
+ * decompression succeeds, then 0 (LIBDEFLATE_SUCCESS) is returned. Otherwise,
+ * a nonzero result code such as LIBDEFLATE_BAD_DATA is returned, and the
+ * contents of the output buffer are undefined.
+ *
+ * Decompression stops at the end of the DEFLATE stream (as indicated by the
+ * BFINAL flag), even if it is actually shorter than 'in_nbytes' bytes.
+ *
+ * libdeflate_deflate_decompress() can be used in cases where the actual
+ * uncompressed size is known (recommended) or unknown (not recommended):
+ *
+ * - If the actual uncompressed size is known, then pass the actual
+ * uncompressed size as 'out_nbytes_avail' and pass NULL for
+ * 'actual_out_nbytes_ret'. This makes libdeflate_deflate_decompress() fail
+ * with LIBDEFLATE_SHORT_OUTPUT if the data decompressed to fewer than the
+ * specified number of bytes.
+ *
+ * - If the actual uncompressed size is unknown, then provide a non-NULL
+ * 'actual_out_nbytes_ret' and provide a buffer with some size
+ * 'out_nbytes_avail' that you think is large enough to hold all the
+ * uncompressed data. In this case, if the data decompresses to less than
+ * or equal to 'out_nbytes_avail' bytes, then
+ * libdeflate_deflate_decompress() will write the actual uncompressed size
+ * to *actual_out_nbytes_ret and return 0 (LIBDEFLATE_SUCCESS). Otherwise,
+ * it will return LIBDEFLATE_INSUFFICIENT_SPACE if the provided buffer was
+ * not large enough but no other problems were encountered, or another
+ * nonzero result code if decompression failed for another reason.
+ */
+LIBDEFLATEAPI enum libdeflate_result
+libdeflate_deflate_decompress(struct libdeflate_decompressor *decompressor,
+ const void *in, size_t in_nbytes,
+ void *out, size_t out_nbytes_avail,
+ size_t *actual_out_nbytes_ret);
+
+/*
+ * Like libdeflate_deflate_decompress(), but adds the 'actual_in_nbytes_ret'
+ * argument. If decompression succeeds and 'actual_in_nbytes_ret' is not NULL,
+ * then the actual compressed size of the DEFLATE stream (aligned to the next
+ * byte boundary) is written to *actual_in_nbytes_ret.
+ */
+LIBDEFLATEAPI enum libdeflate_result
+libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *decompressor,
+ const void *in, size_t in_nbytes,
+ void *out, size_t out_nbytes_avail,
+ size_t *actual_in_nbytes_ret,
+ size_t *actual_out_nbytes_ret);
+
+/*
+ * Like libdeflate_deflate_decompress(), but assumes the zlib wrapper format
+ * instead of raw DEFLATE.
+ *
+ * Decompression will stop at the end of the zlib stream, even if it is shorter
+ * than 'in_nbytes'. If you need to know exactly where the zlib stream ended,
+ * use libdeflate_zlib_decompress_ex().
+ */
+LIBDEFLATEAPI enum libdeflate_result
+libdeflate_zlib_decompress(struct libdeflate_decompressor *decompressor,
+ const void *in, size_t in_nbytes,
+ void *out, size_t out_nbytes_avail,
+ size_t *actual_out_nbytes_ret);
+
+/*
+ * Like libdeflate_zlib_decompress(), but adds the 'actual_in_nbytes_ret'
+ * argument. If 'actual_in_nbytes_ret' is not NULL and the decompression
+ * succeeds (indicating that the first zlib-compressed stream in the input
+ * buffer was decompressed), then the actual number of input bytes consumed is
+ * written to *actual_in_nbytes_ret.
+ */
+LIBDEFLATEAPI enum libdeflate_result
+libdeflate_zlib_decompress_ex(struct libdeflate_decompressor *decompressor,
+ const void *in, size_t in_nbytes,
+ void *out, size_t out_nbytes_avail,
+ size_t *actual_in_nbytes_ret,
+ size_t *actual_out_nbytes_ret);
+
+/*
+ * Like libdeflate_deflate_decompress(), but assumes the gzip wrapper format
+ * instead of raw DEFLATE.
+ *
+ * If multiple gzip-compressed members are concatenated, then only the first
+ * will be decompressed. Use libdeflate_gzip_decompress_ex() if you need
+ * multi-member support.
+ */
+LIBDEFLATEAPI enum libdeflate_result
+libdeflate_gzip_decompress(struct libdeflate_decompressor *decompressor,
+ const void *in, size_t in_nbytes,
+ void *out, size_t out_nbytes_avail,
+ size_t *actual_out_nbytes_ret);
+
+/*
+ * Like libdeflate_gzip_decompress(), but adds the 'actual_in_nbytes_ret'
+ * argument. If 'actual_in_nbytes_ret' is not NULL and the decompression
+ * succeeds (indicating that the first gzip-compressed member in the input
+ * buffer was decompressed), then the actual number of input bytes consumed is
+ * written to *actual_in_nbytes_ret.
+ */
+LIBDEFLATEAPI enum libdeflate_result
+libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *decompressor,
+ const void *in, size_t in_nbytes,
+ void *out, size_t out_nbytes_avail,
+ size_t *actual_in_nbytes_ret,
+ size_t *actual_out_nbytes_ret);
+
+/*
+ * libdeflate_free_decompressor() frees a decompressor that was allocated with
+ * libdeflate_alloc_decompressor(). If a NULL pointer is passed in, no action
+ * is taken.
+ */
+LIBDEFLATEAPI void
+libdeflate_free_decompressor(struct libdeflate_decompressor *decompressor);
+
+/* ========================================================================== */
+/* Checksums */
+/* ========================================================================== */
+
+/*
+ * libdeflate_adler32() updates a running Adler-32 checksum with 'len' bytes of
+ * data and returns the updated checksum. When starting a new checksum, the
+ * required initial value for 'adler' is 1. This value is also returned when
+ * 'buffer' is specified as NULL.
+ */
+LIBDEFLATEAPI uint32_t
+libdeflate_adler32(uint32_t adler, const void *buffer, size_t len);
+
+
+/*
+ * libdeflate_crc32() updates a running CRC-32 checksum with 'len' bytes of data
+ * and returns the updated checksum. When starting a new checksum, the required
+ * initial value for 'crc' is 0. This value is also returned when 'buffer' is
+ * specified as NULL.
+ */
+LIBDEFLATEAPI uint32_t
+libdeflate_crc32(uint32_t crc, const void *buffer, size_t len);
+
+/* ========================================================================== */
+/* Custom memory allocator */
+/* ========================================================================== */
+
+/*
+ * Install a custom memory allocator which libdeflate will use for all memory
+ * allocations. 'malloc_func' is a function that must behave like malloc(), and
+ * 'free_func' is a function that must behave like free().
+ *
+ * There must not be any libdeflate_compressor or libdeflate_decompressor
+ * structures in existence when calling this function.
+ */
+LIBDEFLATEAPI void
+libdeflate_set_memory_allocator(void *(*malloc_func)(size_t),
+ void (*free_func)(void *));
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* LIBDEFLATE_H */
diff --git a/tools/z64compress/src/enc/libdeflate/libdeflate.pc.in b/tools/z64compress/src/enc/libdeflate/libdeflate.pc.in
new file mode 100644
index 000000000..b8ced3c69
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/libdeflate.pc.in
@@ -0,0 +1,18 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+exec_prefix=${prefix}
+includedir=@CMAKE_PKGCONFIG_INCLUDEDIR@
+libdir=@CMAKE_PKGCONFIG_LIBDIR@
+
+Name: libdeflate
+Description: Fast implementation of DEFLATE, zlib, and gzip
+Version: @PROJECT_VERSION@
+Libs: -L${libdir} -ldeflate
+Cflags: -I${includedir}
+
+# Note: this library's public header allows LIBDEFLATE_DLL to be defined when
+# linking to the DLL on Windows, to make __declspec(dllimport) be used.
+# However, the only way to define a shared-library-only flag in a pkgconfig file
+# is to use the weird workaround of unconditionally defining it in Cflags, then
+# undefining it in Cflags.private. Just don't bother with this, since
+# __declspec(dllimport) is optional anyway. It is a very minor performance
+# optimization that is irrelevant for most use cases of libdeflate.
diff --git a/tools/z64compress/src/enc/libdeflate/programs/benchmark.c b/tools/z64compress/src/enc/libdeflate/programs/benchmark.c
new file mode 100644
index 000000000..52af8dafc
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/programs/benchmark.c
@@ -0,0 +1,696 @@
+/*
+ * benchmark.c - a compression testing and benchmark program
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "test_util.h"
+
+/* All recognized command-line options; the digits take an optional argument
+ * so that multi-digit levels such as "-12" parse as '-1' with argument "2". */
+static const tchar *const optstring = T("0::1::2::3::4::5::6::7::8::9::C:D:eghs:VYZz");
+
+/* Which DEFLATE-based stream format to benchmark. */
+enum format {
+	DEFLATE_FORMAT,
+	ZLIB_FORMAT,
+	GZIP_FORMAT,
+};
+
+/* Per-run compression state; 'private' is owned by the selected engine. */
+struct compressor {
+	int level;
+	enum format format;
+	const struct engine *engine;
+	void *private;
+};
+
+/* Per-run decompression state; 'private' is owned by the selected engine. */
+struct decompressor {
+	enum format format;
+	const struct engine *engine;
+	void *private;
+};
+
+/*
+ * A DEFLATE implementation under benchmark, expressed as a vtable of
+ * init / (de)compress / destroy operations.
+ */
+struct engine {
+	const tchar *name;
+
+	bool (*init_compressor)(struct compressor *);
+	size_t (*compress_bound)(struct compressor *, size_t);
+	size_t (*compress)(struct compressor *, const void *, size_t,
+			   void *, size_t);
+	void (*destroy_compressor)(struct compressor *);
+
+	bool (*init_decompressor)(struct decompressor *);
+	bool (*decompress)(struct decompressor *, const void *, size_t,
+			   void *, size_t);
+	void (*destroy_decompressor)(struct decompressor *);
+};
+
+/******************************************************************************/
+
+/* Allocate the libdeflate compressor for the requested level. */
+static bool
+libdeflate_engine_init_compressor(struct compressor *c)
+{
+	void *comp = alloc_compressor(c->level);
+
+	c->private = comp;
+	return comp != NULL;
+}
+
+/* Worst-case output size for this chunk, per the selected stream format. */
+static size_t
+libdeflate_engine_compress_bound(struct compressor *c, size_t in_nbytes)
+{
+	if (c->format == ZLIB_FORMAT)
+		return libdeflate_zlib_compress_bound(c->private, in_nbytes);
+	if (c->format == GZIP_FORMAT)
+		return libdeflate_gzip_compress_bound(c->private, in_nbytes);
+	/* raw DEFLATE */
+	return libdeflate_deflate_compress_bound(c->private, in_nbytes);
+}
+
+/* Compress one chunk with libdeflate; returns 0 if it didn't fit. */
+static size_t
+libdeflate_engine_compress(struct compressor *c, const void *in,
+			   size_t in_nbytes, void *out, size_t out_nbytes_avail)
+{
+	if (c->format == ZLIB_FORMAT)
+		return libdeflate_zlib_compress(c->private, in, in_nbytes,
+						out, out_nbytes_avail);
+	if (c->format == GZIP_FORMAT)
+		return libdeflate_gzip_compress(c->private, in, in_nbytes,
+						out, out_nbytes_avail);
+	/* raw DEFLATE */
+	return libdeflate_deflate_compress(c->private, in, in_nbytes,
+					   out, out_nbytes_avail);
+}
+
+/* Release the libdeflate compressor (NULL-safe per the libdeflate API). */
+static void
+libdeflate_engine_destroy_compressor(struct compressor *c)
+{
+	libdeflate_free_compressor(c->private);
+}
+
+/* Allocate a libdeflate decompressor; false on allocation failure. */
+static bool
+libdeflate_engine_init_decompressor(struct decompressor *d)
+{
+	d->private = alloc_decompressor();
+	return d->private != NULL;
+}
+
+/* Decompress one chunk with libdeflate; true iff the result code was 0. */
+static bool
+libdeflate_engine_decompress(struct decompressor *d, const void *in,
+			     size_t in_nbytes, void *out, size_t out_nbytes)
+{
+	enum libdeflate_result res;
+
+	if (d->format == ZLIB_FORMAT)
+		res = libdeflate_zlib_decompress(d->private, in, in_nbytes,
+						 out, out_nbytes, NULL);
+	else if (d->format == GZIP_FORMAT)
+		res = libdeflate_gzip_decompress(d->private, in, in_nbytes,
+						 out, out_nbytes, NULL);
+	else
+		res = libdeflate_deflate_decompress(d->private, in, in_nbytes,
+						    out, out_nbytes, NULL);
+	return !res;
+}
+
+/* Release the libdeflate decompressor (NULL-safe per the libdeflate API). */
+static void
+libdeflate_engine_destroy_decompressor(struct decompressor *d)
+{
+	libdeflate_free_decompressor(d->private);
+}
+
+/* Engine vtable for benchmarking libdeflate itself. */
+static const struct engine libdeflate_engine = {
+	.name			= T("libdeflate"),
+
+	.init_compressor	= libdeflate_engine_init_compressor,
+	.compress_bound		= libdeflate_engine_compress_bound,
+	.compress		= libdeflate_engine_compress,
+	.destroy_compressor	= libdeflate_engine_destroy_compressor,
+
+	.init_decompressor	= libdeflate_engine_init_decompressor,
+	.decompress		= libdeflate_engine_decompress,
+	.destroy_decompressor	= libdeflate_engine_destroy_decompressor,
+};
+
+/******************************************************************************/
+
+/*
+ * Map a stream format to the windowBits value zlib's *Init2() functions
+ * expect: plain 15 selects the zlib wrapper, 15 + 16 selects gzip, and a
+ * negative value selects raw DEFLATE.
+ */
+static int
+get_libz_window_bits(enum format format)
+{
+	enum { WINDOW_BITS = 15 };
+
+	if (format == GZIP_FORMAT)
+		return WINDOW_BITS + 16;
+	if (format == ZLIB_FORMAT)
+		return WINDOW_BITS;
+	return -WINDOW_BITS;
+}
+
+/* Allocate and initialize a zlib deflate stream for this run's settings. */
+static bool
+libz_engine_init_compressor(struct compressor *c)
+{
+	z_stream *z;
+
+	/* zlib's levels stop at 9; libdeflate's extend to 12. */
+	if (c->level > 9) {
+		msg("libz only supports up to compression level 9");
+		return false;
+	}
+
+	z = xmalloc(sizeof(*z));
+	if (z == NULL)
+		return false;
+
+	/* deflateInit2() reads these fields, so set them before the call. */
+	z->next_in = NULL;
+	z->avail_in = 0;
+	z->zalloc = NULL;
+	z->zfree = NULL;
+	z->opaque = NULL;
+	if (deflateInit2(z, c->level, Z_DEFLATED,
+			 get_libz_window_bits(c->format),
+			 8, Z_DEFAULT_STRATEGY) != Z_OK)
+	{
+		msg("unable to initialize deflater");
+		free(z);
+		return false;
+	}
+
+	c->private = z;
+	return true;
+}
+
+/* Worst-case compressed size, as reported by zlib itself. */
+static size_t
+libz_engine_compress_bound(struct compressor *c, size_t in_nbytes)
+{
+	return deflateBound(c->private, in_nbytes);
+}
+
+/* Compress one chunk with zlib; returns 0 if it didn't fit in the buffer. */
+static size_t
+libz_engine_compress(struct compressor *c, const void *in, size_t in_nbytes,
+		     void *out, size_t out_nbytes_avail)
+{
+	z_stream *z = c->private;
+
+	/* Reuse the stream across chunks instead of re-initializing it. */
+	deflateReset(z);
+
+	z->next_in = (void *)in;
+	z->avail_in = in_nbytes;
+	z->next_out = out;
+	z->avail_out = out_nbytes_avail;
+
+	/* Single-shot compression: anything other than Z_STREAM_END means
+	 * the whole input couldn't be compressed into the output buffer. */
+	if (deflate(z, Z_FINISH) != Z_STREAM_END)
+		return 0;
+
+	return out_nbytes_avail - z->avail_out;
+}
+
+/* Tear down the zlib deflate stream and free its wrapper allocation. */
+static void
+libz_engine_destroy_compressor(struct compressor *c)
+{
+	z_stream *z = c->private;
+
+	deflateEnd(z);
+	free(z);
+}
+
+/* Allocate and initialize a zlib inflate stream for this run's format. */
+static bool
+libz_engine_init_decompressor(struct decompressor *d)
+{
+	z_stream *z;
+
+	z = xmalloc(sizeof(*z));
+	if (z == NULL)
+		return false;
+
+	/* inflateInit2() reads these fields, so set them before the call. */
+	z->next_in = NULL;
+	z->avail_in = 0;
+	z->zalloc = NULL;
+	z->zfree = NULL;
+	z->opaque = NULL;
+	if (inflateInit2(z, get_libz_window_bits(d->format)) != Z_OK) {
+		msg("unable to initialize inflater");
+		free(z);
+		return false;
+	}
+
+	d->private = z;
+	return true;
+}
+
+/* Decompress one chunk with zlib. */
+static bool
+libz_engine_decompress(struct decompressor *d, const void *in, size_t in_nbytes,
+		       void *out, size_t out_nbytes)
+{
+	z_stream *z = d->private;
+
+	inflateReset(z);
+
+	z->next_in = (void *)in;
+	z->avail_in = in_nbytes;
+	z->next_out = out;
+	z->avail_out = out_nbytes;
+
+	/* Success requires reaching end-of-stream AND exactly filling the
+	 * output buffer, since the caller knows the uncompressed size. */
+	return inflate(z, Z_FINISH) == Z_STREAM_END && z->avail_out == 0;
+}
+
+/* Tear down the zlib inflate stream and free its wrapper allocation. */
+static void
+libz_engine_destroy_decompressor(struct decompressor *d)
+{
+	z_stream *z = d->private;
+
+	inflateEnd(z);
+	free(z);
+}
+
+/* Engine vtable for benchmarking against zlib. */
+static const struct engine libz_engine = {
+	.name			= T("libz"),
+
+	.init_compressor	= libz_engine_init_compressor,
+	.compress_bound		= libz_engine_compress_bound,
+	.compress		= libz_engine_compress,
+	.destroy_compressor	= libz_engine_destroy_compressor,
+
+	.init_decompressor	= libz_engine_init_decompressor,
+	.decompress		= libz_engine_decompress,
+	.destroy_decompressor	= libz_engine_destroy_decompressor,
+};
+
+/******************************************************************************/
+
+/* Every engine selectable with the -C and -D options. */
+static const struct engine * const all_engines[] = {
+	&libdeflate_engine,
+	&libz_engine,
+};
+
+#define DEFAULT_ENGINE libdeflate_engine
+
+/* Look up an engine by its -C/-D option name; NULL if no engine matches. */
+static const struct engine *
+name_to_engine(const tchar *name)
+{
+	const struct engine * const *e = all_engines;
+	const struct engine * const *end = all_engines + ARRAY_LEN(all_engines);
+
+	for (; e != end; e++) {
+		if (tstrcmp((*e)->name, name) == 0)
+			return *e;
+	}
+	return NULL;
+}
+
+/******************************************************************************/
+
+/* Record the run's settings, then let the engine set up its private state. */
+static bool
+compressor_init(struct compressor *c, int level, enum format format,
+		const struct engine *engine)
+{
+	c->level = level;
+	c->format = format;
+	c->engine = engine;
+	return engine->init_compressor(c);
+}
+
+/* Dispatch to the engine's worst-case-size calculation. */
+static size_t
+compress_bound(struct compressor *c, size_t in_nbytes)
+{
+	return c->engine->compress_bound(c, in_nbytes);
+}
+
+/* Dispatch to the engine's compress operation. */
+static size_t
+do_compress(struct compressor *c, const void *in, size_t in_nbytes,
+	    void *out, size_t out_nbytes_avail)
+{
+	return c->engine->compress(c, in, in_nbytes, out, out_nbytes_avail);
+}
+
+/* Destroy the engine state; safe on a zero-initialized (never-inited) struct. */
+static void
+compressor_destroy(struct compressor *c)
+{
+	if (c->engine != NULL)
+		c->engine->destroy_compressor(c);
+}
+
+/* Record the run's settings, then let the engine set up its private state. */
+static bool
+decompressor_init(struct decompressor *d, enum format format,
+		  const struct engine *engine)
+{
+	d->format = format;
+	d->engine = engine;
+	return engine->init_decompressor(d);
+}
+
+/* Dispatch to the engine's decompress operation. */
+static bool
+do_decompress(struct decompressor *d, const void *in, size_t in_nbytes,
+	      void *out, size_t out_nbytes)
+{
+	return d->engine->decompress(d, in, in_nbytes, out, out_nbytes);
+}
+
+/* Destroy the engine state; safe on a zero-initialized (never-inited) struct. */
+static void
+decompressor_destroy(struct decompressor *d)
+{
+	if (d->engine != NULL)
+		d->engine->destroy_decompressor(d);
+}
+
+/******************************************************************************/
+
+/* Print the comma-separated engine list and the default, e.g. for -h output. */
+static void
+show_available_engines(FILE *fp)
+{
+	const size_t count = ARRAY_LEN(all_engines);
+	size_t i;
+
+	fprintf(fp, "Available ENGINEs are: ");
+	for (i = 0; i < count; i++) {
+		fprintf(fp, "%"TS, all_engines[i]->name);
+		if (i != count - 1)
+			fprintf(fp, ", ");
+	}
+	fprintf(fp, ". Default is %"TS"\n", DEFAULT_ENGINE.name);
+}
+
+/* Print the usage summary to 'fp' (stdout for -h, stderr on option errors). */
+static void
+show_usage(FILE *fp)
+{
+	fprintf(fp,
+"Usage: %"TS" [-LVL] [-C ENGINE] [-D ENGINE] [-ghVz] [-s SIZE] [FILE]...\n"
+"Benchmark DEFLATE compression and decompression on the specified FILEs.\n"
+"\n"
+"Options:\n"
+"  -0        no compression\n"
+"  -1        fastest (worst) compression\n"
+"  -6        medium compression (default)\n"
+"  -12       slowest (best) compression\n"
+"  -C ENGINE compression engine\n"
+"  -D ENGINE decompression engine\n"
+"  -e        allow chunks to be expanded (implied by -0)\n"
+"  -g        use gzip format instead of raw DEFLATE\n"
+"  -h        print this help\n"
+"  -s SIZE   chunk size\n"
+"  -V        show version and legal information\n"
+"  -z        use zlib format instead of raw DEFLATE\n"
+"\n", prog_invocation_name);
+
+	show_available_engines(fp);
+}
+
+/* Print version and license information for the -V option. */
+static void
+show_version(void)
+{
+	printf(
+"libdeflate compression benchmark program v" LIBDEFLATE_VERSION_STRING "\n"
+"Copyright 2016 Eric Biggers\n"
+"\n"
+"This program is free software which may be modified and/or redistributed\n"
+"under the terms of the MIT license.  There is NO WARRANTY, to the extent\n"
+"permitted by law.  See the COPYING file for details.\n"
+	);
+}
+
+
+/******************************************************************************/
+
+/*
+ * Read 'in' chunk by chunk, compressing each chunk, decompressing the result,
+ * and verifying it round-trips to the original bytes, while accumulating
+ * timing totals.  Prints a size/speed summary at the end.  Returns 0 on
+ * success, negative on read error or verification failure.
+ */
+static int
+do_benchmark(struct file_stream *in, void *original_buf, void *compressed_buf,
+	     void *decompressed_buf, u32 chunk_size,
+	     bool allow_expansion, size_t compressed_buf_size,
+	     struct compressor *compressor,
+	     struct decompressor *decompressor)
+{
+	u64 total_uncompressed_size = 0;
+	u64 total_compressed_size = 0;
+	u64 total_compress_time = 0;
+	u64 total_decompress_time = 0;
+	ssize_t ret;
+
+	while ((ret = xread(in, original_buf, chunk_size)) > 0) {
+		u32 original_size = ret;
+		size_t out_nbytes_avail;
+		u32 compressed_size;
+		u64 start_time;
+		bool ok;
+
+		total_uncompressed_size += original_size;
+
+		if (allow_expansion) {
+			/* The buffer was sized in tmain() from
+			 * compress_bound(chunk_size); a larger bound for a
+			 * smaller chunk would be an engine bug. */
+			out_nbytes_avail = compress_bound(compressor,
+							  original_size);
+			if (out_nbytes_avail > compressed_buf_size) {
+				msg("%"TS": bug in compress_bound()", in->name);
+				return -1;
+			}
+		} else {
+			/* Require at least one byte of savings, as a
+			 * block-based application would. */
+			out_nbytes_avail = original_size - 1;
+		}
+
+		/* Compress the chunk of data. */
+		start_time = timer_ticks();
+		compressed_size = do_compress(compressor,
+					      original_buf,
+					      original_size,
+					      compressed_buf,
+					      out_nbytes_avail);
+		total_compress_time += timer_ticks() - start_time;
+
+		if (compressed_size) {
+			/* Successfully compressed the chunk of data. */
+
+			/* Decompress the data we just compressed and compare
+			 * the result with the original. */
+			start_time = timer_ticks();
+			ok = do_decompress(decompressor,
+					   compressed_buf, compressed_size,
+					   decompressed_buf, original_size);
+			total_decompress_time += timer_ticks() - start_time;
+
+			if (!ok) {
+				msg("%"TS": failed to decompress data",
+				    in->name);
+				return -1;
+			}
+
+			if (memcmp(original_buf, decompressed_buf,
+				   original_size) != 0)
+			{
+				msg("%"TS": data did not decompress to "
+				    "original", in->name);
+				return -1;
+			}
+
+			total_compressed_size += compressed_size;
+		} else {
+			/*
+			 * The chunk would have compressed to more than
+			 * out_nbytes_avail bytes.
+			 */
+			if (allow_expansion) {
+				msg("%"TS": bug in compress_bound()", in->name);
+				return -1;
+			}
+			/* Count it as "stored uncompressed". */
+			total_compressed_size += original_size;
+		}
+	}
+
+	if (ret < 0)
+		return ret;
+
+	if (total_uncompressed_size == 0) {
+		printf("\tFile was empty.\n");
+		return 0;
+	}
+
+	/* Avoid division by zero in the speed calculations below. */
+	if (total_compress_time == 0)
+		total_compress_time = 1;
+	if (total_decompress_time == 0)
+		total_decompress_time = 1;
+
+	/* Ratio is printed with three decimal places via integer math. */
+	printf("\tCompressed %"PRIu64 " => %"PRIu64" bytes (%u.%03u%%)\n",
+	       total_uncompressed_size, total_compressed_size,
+	       (unsigned int)(total_compressed_size * 100 /
+			      total_uncompressed_size),
+	       (unsigned int)(total_compressed_size * 100000 /
+			      total_uncompressed_size % 1000));
+	printf("\tCompression time: %"PRIu64" ms (%"PRIu64" MB/s)\n",
+	       timer_ticks_to_ms(total_compress_time),
+	       timer_MB_per_s(total_uncompressed_size, total_compress_time));
+	printf("\tDecompression time: %"PRIu64" ms (%"PRIu64" MB/s)\n",
+	       timer_ticks_to_ms(total_decompress_time),
+	       timer_MB_per_s(total_uncompressed_size, total_decompress_time));
+
+	return 0;
+}
+
+/*
+ * Entry point: parse options, set up the chosen compression/decompression
+ * engines and the working buffers, then benchmark each FILE argument
+ * (standard input if none are given).  Returns 0 on success, nonzero on
+ * any error.
+ */
+int
+tmain(int argc, tchar *argv[])
+{
+	u32 chunk_size = 1048576;
+	int level = 6;
+	enum format format = DEFLATE_FORMAT;
+	const struct engine *compress_engine = &DEFAULT_ENGINE;
+	const struct engine *decompress_engine = &DEFAULT_ENGINE;
+	bool allow_expansion = false;
+	struct compressor compressor = { 0 };
+	struct decompressor decompressor = { 0 };
+	size_t compressed_buf_size;
+	void *original_buf = NULL;
+	void *compressed_buf = NULL;
+	void *decompressed_buf = NULL;
+	tchar *default_file_list[] = { NULL };
+	int opt_char;
+	int i;
+	int ret;
+
+	begin_program(argv);
+
+	while ((opt_char = tgetopt(argc, argv, optstring)) != -1) {
+		switch (opt_char) {
+		case '0':
+		case '1':
+		case '2':
+		case '3':
+		case '4':
+		case '5':
+		case '6':
+		case '7':
+		case '8':
+		case '9':
+			/* Digit plus optional suffix, so "-12" works too. */
+			level = parse_compression_level(opt_char, toptarg);
+			if (level < 0)
+				return 1;
+			break;
+		case 'C':
+			compress_engine = name_to_engine(toptarg);
+			if (compress_engine == NULL) {
+				msg("invalid compression engine: \"%"TS"\"", toptarg);
+				show_available_engines(stderr);
+				return 1;
+			}
+			break;
+		case 'D':
+			decompress_engine = name_to_engine(toptarg);
+			if (decompress_engine == NULL) {
+				msg("invalid decompression engine: \"%"TS"\"", toptarg);
+				show_available_engines(stderr);
+				return 1;
+			}
+			break;
+		case 'e':
+			allow_expansion = true;
+			break;
+		case 'g':
+			format = GZIP_FORMAT;
+			break;
+		case 'h':
+			show_usage(stdout);
+			return 0;
+		case 's':
+			chunk_size = tstrtoul(toptarg, NULL, 10);
+			if (chunk_size == 0) {
+				msg("invalid chunk size: \"%"TS"\"", toptarg);
+				return 1;
+			}
+			break;
+		case 'V':
+			show_version();
+			return 0;
+		case 'Y': /* deprecated, use '-C libz' instead */
+			compress_engine = &libz_engine;
+			break;
+		case 'Z': /* deprecated, use '-D libz' instead */
+			decompress_engine = &libz_engine;
+			break;
+		case 'z':
+			format = ZLIB_FORMAT;
+			break;
+		default:
+			show_usage(stderr);
+			return 1;
+		}
+	}
+
+	/* Shift past the parsed options to the FILE arguments. */
+	argc -= toptind;
+	argv += toptind;
+
+	/* Level 0 never shrinks the data, so expansion must be allowed. */
+	if (level == 0)
+		allow_expansion = true;
+
+	ret = -1;
+	if (!compressor_init(&compressor, level, format, compress_engine))
+		goto out;
+	if (!decompressor_init(&decompressor, format, decompress_engine))
+		goto out;
+
+	/* Without -e, cap output at chunk_size - 1 to force real savings. */
+	if (allow_expansion)
+		compressed_buf_size = compress_bound(&compressor, chunk_size);
+	else
+		compressed_buf_size = chunk_size - 1;
+
+	original_buf = xmalloc(chunk_size);
+	compressed_buf = xmalloc(compressed_buf_size);
+	decompressed_buf = xmalloc(chunk_size);
+
+	ret = -1;
+	if (original_buf == NULL || compressed_buf == NULL ||
+	    decompressed_buf == NULL)
+		goto out;
+
+	if (argc == 0) {
+		/* No FILEs given: benchmark standard input (NULL entry). */
+		argv = default_file_list;
+		argc = ARRAY_LEN(default_file_list);
+	} else {
+		/* A lone "-" argument also means standard input. */
+		for (i = 0; i < argc; i++)
+			if (argv[i][0] == '-' && argv[i][1] == '\0')
+				argv[i] = NULL;
+	}
+
+	printf("Benchmarking %s compression:\n",
+	       format == DEFLATE_FORMAT ? "DEFLATE" :
+	       format == ZLIB_FORMAT ? "zlib" : "gzip");
+	printf("\tCompression level: %d\n", level);
+	printf("\tChunk size: %"PRIu32"\n", chunk_size);
+	printf("\tCompression engine: %"TS"\n", compress_engine->name);
+	printf("\tDecompression engine: %"TS"\n", decompress_engine->name);
+
+	for (i = 0; i < argc; i++) {
+		struct file_stream in;
+
+		ret = xopen_for_read(argv[i], true, &in);
+		if (ret != 0)
+			goto out;
+
+		printf("Processing %"TS"...\n", in.name);
+
+		ret = do_benchmark(&in, original_buf, compressed_buf,
+				   decompressed_buf, chunk_size,
+				   allow_expansion, compressed_buf_size,
+				   &compressor, &decompressor);
+		xclose(&in);
+		if (ret != 0)
+			goto out;
+	}
+	ret = 0;
+out:
+	/* goto-based cleanup: everything below is safe on partial setup. */
+	free(decompressed_buf);
+	free(compressed_buf);
+	free(original_buf);
+	decompressor_destroy(&decompressor);
+	compressor_destroy(&compressor);
+	/* ret is 0 or negative; the process exit status must be >= 0. */
+	return -ret;
+}
diff --git a/tools/z64compress/src/enc/libdeflate/programs/checksum.c b/tools/z64compress/src/enc/libdeflate/programs/checksum.c
new file mode 100644
index 000000000..68cd43c91
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/programs/checksum.c
@@ -0,0 +1,218 @@
+/*
+ * checksum.c - Adler-32 and CRC-32 checksumming program
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "test_util.h"
+
+static const tchar *const optstring = T("Ahm:s:tZ");
+
+static void
+show_usage(FILE *fp)
+{
+ fprintf(fp,
+"Usage: %"TS" [-A] [-h] [-m ALIGN] [-s SIZE] [-t] [-Z] [FILE]...\n"
+"Calculate Adler-32 or CRC-32 checksums of the specified FILEs.\n"
+"\n"
+"Options:\n"
+" -A use Adler-32 (default is CRC-32)\n"
+" -h print this help\n"
+" -m ALIGN misalign the buffer by ALIGN bytes\n"
+" -s SIZE chunk size in bytes\n"
+" -t show checksum speed, excluding I/O\n"
+" -Z use zlib implementation instead of libdeflate\n",
+ prog_invocation_name);
+}
+
+typedef u32 (*cksum_fn_t)(u32, const void *, size_t);
+
+static u32
+adler32_libdeflate(u32 adler, const void *buf, size_t len)
+{
+ return libdeflate_adler32(adler, buf, len);
+}
+
+static u32
+crc32_libdeflate(u32 crc, const void *buf, size_t len)
+{
+ return libdeflate_crc32(crc, buf, len);
+}
+
+static u32
+adler32_zlib(u32 adler, const void *buf, size_t len)
+{
+ return adler32(adler, buf, len);
+}
+
+static u32
+crc32_zlib(u32 crc, const void *buf, size_t len)
+{
+ return crc32(crc, buf, len);
+}
+
+static int
+checksum_stream(struct file_stream *in, cksum_fn_t cksum, u32 *sum,
+ void *buf, size_t bufsize, u64 *size_ret, u64 *elapsed_ret)
+{
+ u64 size = 0;
+ u64 elapsed = 0;
+
+ for (;;) {
+ ssize_t ret;
+ u64 start_time;
+
+ ret = xread(in, buf, bufsize);
+ if (ret < 0)
+ return ret;
+ if (ret == 0)
+ break;
+
+ size += ret;
+ start_time = timer_ticks();
+ *sum = cksum(*sum, buf, ret);
+ elapsed += timer_ticks() - start_time;
+ }
+
+ if (elapsed == 0)
+ elapsed = 1;
+ *size_ret = size;
+ *elapsed_ret = elapsed;
+ return 0;
+}
+
+int
+tmain(int argc, tchar *argv[])
+{
+ bool use_adler32 = false;
+ bool use_zlib_impl = false;
+ bool do_timing = false;
+ void *orig_buf = NULL;
+ void *buf;
+ size_t misalignment = 0;
+ size_t bufsize = 131072;
+ tchar *default_file_list[] = { NULL };
+ cksum_fn_t cksum;
+ int opt_char;
+ int i;
+ int ret;
+
+ begin_program(argv);
+
+ while ((opt_char = tgetopt(argc, argv, optstring)) != -1) {
+ switch (opt_char) {
+ case 'A':
+ use_adler32 = true;
+ break;
+ case 'h':
+ show_usage(stdout);
+ return 0;
+ case 'm':
+ misalignment = tstrtoul(toptarg, NULL, 10);
+ if (misalignment >= 4096) {
+ msg("invalid misalignment: \"%"TS"\"", toptarg);
+ return 1;
+ }
+ break;
+ case 's':
+ bufsize = tstrtoul(toptarg, NULL, 10);
+ if (bufsize == 0 || bufsize > SIZE_MAX / 2) {
+ msg("invalid chunk size: \"%"TS"\"", toptarg);
+ return 1;
+ }
+ break;
+ case 't':
+ do_timing = true;
+ break;
+ case 'Z':
+ use_zlib_impl = true;
+ break;
+ default:
+ show_usage(stderr);
+ return 1;
+ }
+ }
+
+ argc -= toptind;
+ argv += toptind;
+
+ if (use_adler32) {
+ if (use_zlib_impl)
+ cksum = adler32_zlib;
+ else
+ cksum = adler32_libdeflate;
+ } else {
+ if (use_zlib_impl)
+ cksum = crc32_zlib;
+ else
+ cksum = crc32_libdeflate;
+ }
+
+ orig_buf = xmalloc(bufsize + 4096 + misalignment);
+ if (orig_buf == NULL)
+ return 1;
+ buf = (u8 *)orig_buf + (-(uintptr_t)orig_buf % 4096) + misalignment;
+
+ if (argc == 0) {
+ argv = default_file_list;
+ argc = ARRAY_LEN(default_file_list);
+ } else {
+ for (i = 0; i < argc; i++)
+ if (argv[i][0] == '-' && argv[i][1] == '\0')
+ argv[i] = NULL;
+ }
+
+ for (i = 0; i < argc; i++) {
+ struct file_stream in;
+ u32 sum = cksum(0, NULL, 0);
+ u64 size = 0;
+ u64 elapsed = 0;
+
+ ret = xopen_for_read(argv[i], true, &in);
+ if (ret != 0)
+ goto out;
+
+ ret = checksum_stream(&in, cksum, &sum, buf, bufsize,
+ &size, &elapsed);
+ if (ret == 0) {
+ if (do_timing) {
+ printf("%08"PRIx32"\t%"TS"\t"
+ "%"PRIu64" ms\t%"PRIu64" MB/s\n",
+ sum, in.name, timer_ticks_to_ms(elapsed),
+ timer_MB_per_s(size, elapsed));
+ } else {
+ printf("%08"PRIx32"\t%"TS"\t\n", sum, in.name);
+ }
+ }
+
+ xclose(&in);
+
+ if (ret != 0)
+ goto out;
+ }
+ ret = 0;
+out:
+ free(orig_buf);
+ return -ret;
+}
diff --git a/tools/z64compress/src/enc/libdeflate/programs/config.h.in b/tools/z64compress/src/enc/libdeflate/programs/config.h.in
new file mode 100644
index 000000000..588aa8dca
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/programs/config.h.in
@@ -0,0 +1,22 @@
+#ifndef CONFIG_H
+#define CONFIG_H
+
+/* Is the clock_gettime() function available? */
+#cmakedefine HAVE_CLOCK_GETTIME
+
+/* Is the futimens() function available? */
+#cmakedefine HAVE_FUTIMENS
+
+/* Is the futimes() function available? */
+#cmakedefine HAVE_FUTIMES
+
+/* Is the posix_fadvise() function available? */
+#cmakedefine HAVE_POSIX_FADVISE
+
+/* Is the posix_madvise() function available? */
+#cmakedefine HAVE_POSIX_MADVISE
+
+/* Does stat() provide nanosecond-precision timestamps? */
+#cmakedefine HAVE_STAT_NANOSECOND_PRECISION
+
+#endif /* CONFIG_H */
diff --git a/tools/z64compress/src/enc/libdeflate/programs/gzip.c b/tools/z64compress/src/enc/libdeflate/programs/gzip.c
new file mode 100644
index 000000000..c13474af5
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/programs/gzip.c
@@ -0,0 +1,701 @@
+/*
+ * gzip.c - a file compression and decompression program
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifdef __sun
+# define __EXTENSIONS__ /* for futimens() */
+#endif
+
+#include "prog_util.h"
+
+#include <errno.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#ifdef _WIN32
+# include <sys/utime.h>
+#else
+# include <sys/time.h>
+# include <unistd.h>
+# include <utime.h>
+#endif
+
+#define GZIP_MIN_HEADER_SIZE 10
+#define GZIP_FOOTER_SIZE 8
+#define GZIP_MIN_OVERHEAD (GZIP_MIN_HEADER_SIZE + GZIP_FOOTER_SIZE)
+#define GZIP_ID1 0x1F
+#define GZIP_ID2 0x8B
+
+struct options {
+ bool to_stdout;
+ bool decompress;
+ bool force;
+ bool keep;
+ bool test;
+ int compression_level;
+ const tchar *suffix;
+};
+
+static const tchar *const optstring = T("1::2::3::4::5::6::7::8::9::cdfhknqS:tV");
+
+static void
+show_usage(FILE *fp)
+{
+ fprintf(fp,
+"Usage: %"TS" [-LEVEL] [-cdfhkqtV] [-S SUF] FILE...\n"
+"Compress or decompress the specified FILEs.\n"
+"\n"
+"Options:\n"
+" -1 fastest (worst) compression\n"
+" -6 medium compression (default)\n"
+" -12 slowest (best) compression\n"
+" -c write to standard output\n"
+" -d decompress\n"
+" -f overwrite existing output files; (de)compress hard-linked files;\n"
+" allow reading/writing compressed data from/to terminal;\n"
+" with gunzip -c, pass through non-gzipped data\n"
+" -h print this help\n"
+" -k don't delete input files\n"
+" -q suppress warnings\n"
+" -S SUF use suffix SUF instead of .gz\n"
+" -t test file integrity\n"
+" -V show version and legal information\n",
+ prog_invocation_name);
+}
+
+static void
+show_version(void)
+{
+ printf(
+"gzip compression program v" LIBDEFLATE_VERSION_STRING "\n"
+"Copyright 2016 Eric Biggers\n"
+"\n"
+"This program is free software which may be modified and/or redistributed\n"
+"under the terms of the MIT license. There is NO WARRANTY, to the extent\n"
+"permitted by law. See the COPYING file for details.\n"
+ );
+}
+
+/* Was the program invoked in decompression mode? */
+static bool
+is_gunzip(void)
+{
+ if (tstrxcmp(prog_invocation_name, T("gunzip")) == 0)
+ return true;
+ if (tstrxcmp(prog_invocation_name, T("libdeflate-gunzip")) == 0)
+ return true;
+#ifdef _WIN32
+ if (tstrxcmp(prog_invocation_name, T("gunzip.exe")) == 0)
+ return true;
+ if (tstrxcmp(prog_invocation_name, T("libdeflate-gunzip.exe")) == 0)
+ return true;
+#endif
+ return false;
+}
+
+static const tchar *
+get_suffix(const tchar *path, const tchar *suffix)
+{
+ size_t path_len = tstrlen(path);
+ size_t suffix_len = tstrlen(suffix);
+ const tchar *p;
+
+ if (path_len <= suffix_len)
+ return NULL;
+ p = &path[path_len - suffix_len];
+ if (tstrxcmp(p, suffix) == 0)
+ return p;
+ return NULL;
+}
+
+static bool
+has_suffix(const tchar *path, const tchar *suffix)
+{
+ return get_suffix(path, suffix) != NULL;
+}
+
+static tchar *
+append_suffix(const tchar *path, const tchar *suffix)
+{
+ size_t path_len = tstrlen(path);
+ size_t suffix_len = tstrlen(suffix);
+ tchar *suffixed_path;
+
+ suffixed_path = xmalloc((path_len + suffix_len + 1) * sizeof(tchar));
+ if (suffixed_path == NULL)
+ return NULL;
+ tmemcpy(suffixed_path, path, path_len);
+ tmemcpy(&suffixed_path[path_len], suffix, suffix_len + 1);
+ return suffixed_path;
+}
+
+static int
+do_compress(struct libdeflate_compressor *compressor,
+ struct file_stream *in, struct file_stream *out)
+{
+ const void *uncompressed_data = in->mmap_mem;
+ size_t uncompressed_size = in->mmap_size;
+ void *compressed_data;
+ size_t actual_compressed_size;
+ size_t max_compressed_size;
+ int ret;
+
+ max_compressed_size = libdeflate_gzip_compress_bound(compressor,
+ uncompressed_size);
+ compressed_data = xmalloc(max_compressed_size);
+ if (compressed_data == NULL) {
+ msg("%"TS": file is probably too large to be processed by this "
+ "program", in->name);
+ ret = -1;
+ goto out;
+ }
+
+ actual_compressed_size = libdeflate_gzip_compress(compressor,
+ uncompressed_data,
+ uncompressed_size,
+ compressed_data,
+ max_compressed_size);
+ if (actual_compressed_size == 0) {
+ msg("Bug in libdeflate_gzip_compress_bound()!");
+ ret = -1;
+ goto out;
+ }
+
+ ret = full_write(out, compressed_data, actual_compressed_size);
+out:
+ free(compressed_data);
+ return ret;
+}
+
+static int
+do_decompress(struct libdeflate_decompressor *decompressor,
+ struct file_stream *in, struct file_stream *out,
+ const struct options *options)
+{
+ const u8 *compressed_data = in->mmap_mem;
+ size_t compressed_size = in->mmap_size;
+ void *uncompressed_data = NULL;
+ size_t uncompressed_size;
+ size_t max_uncompressed_size;
+ size_t actual_in_nbytes;
+ size_t actual_out_nbytes;
+ enum libdeflate_result result;
+ int ret = 0;
+
+ if (compressed_size < GZIP_MIN_OVERHEAD ||
+ compressed_data[0] != GZIP_ID1 ||
+ compressed_data[1] != GZIP_ID2) {
+ if (options->force && options->to_stdout)
+ return full_write(out, compressed_data, compressed_size);
+ msg("%"TS": not in gzip format", in->name);
+ return -1;
+ }
+
+ /*
+ * Use the ISIZE field as a hint for the decompressed data size. It may
+ * need to be increased later, however, because the file may contain
+ * multiple gzip members and the particular ISIZE we happen to use may
+ * not be the largest; or the real size may be >= 4 GiB, causing ISIZE
+ * to overflow. In any case, make sure to allocate at least one byte.
+ */
+ uncompressed_size =
+ get_unaligned_le32(&compressed_data[compressed_size - 4]);
+ if (uncompressed_size == 0)
+ uncompressed_size = 1;
+
+ /*
+ * DEFLATE cannot expand data more than 1032x, so there's no need to
+ * ever allocate a buffer more than 1032 times larger than the
+ * compressed data. This is a fail-safe, albeit not a very good one, if
+ * ISIZE becomes corrupted on a small file. (The 1032x number comes
+ * from each 2 bits generating a 258-byte match. This is a hard upper
+ * bound; the real upper bound is slightly smaller due to overhead.)
+ */
+ if (compressed_size <= SIZE_MAX / 1032)
+ max_uncompressed_size = compressed_size * 1032;
+ else
+ max_uncompressed_size = SIZE_MAX;
+
+ do {
+ if (uncompressed_data == NULL) {
+ uncompressed_size = MIN(uncompressed_size,
+ max_uncompressed_size);
+ uncompressed_data = xmalloc(uncompressed_size);
+ if (uncompressed_data == NULL) {
+ msg("%"TS": file is probably too large to be "
+ "processed by this program", in->name);
+ ret = -1;
+ goto out;
+ }
+ }
+
+ result = libdeflate_gzip_decompress_ex(decompressor,
+ compressed_data,
+ compressed_size,
+ uncompressed_data,
+ uncompressed_size,
+ &actual_in_nbytes,
+ &actual_out_nbytes);
+
+ if (result == LIBDEFLATE_INSUFFICIENT_SPACE) {
+ if (uncompressed_size >= max_uncompressed_size) {
+ msg("Bug in libdeflate_gzip_decompress_ex(): data expanded too much!");
+ ret = -1;
+ goto out;
+ }
+ if (uncompressed_size * 2 <= uncompressed_size) {
+ msg("%"TS": file corrupt or too large to be "
+ "processed by this program", in->name);
+ ret = -1;
+ goto out;
+ }
+ uncompressed_size *= 2;
+ free(uncompressed_data);
+ uncompressed_data = NULL;
+ continue;
+ }
+
+ if (result != LIBDEFLATE_SUCCESS) {
+ msg("%"TS": file corrupt or not in gzip format",
+ in->name);
+ ret = -1;
+ goto out;
+ }
+
+ if (actual_in_nbytes == 0 ||
+ actual_in_nbytes > compressed_size ||
+ actual_out_nbytes > uncompressed_size) {
+ msg("Bug in libdeflate_gzip_decompress_ex(): impossible actual_nbytes value!");
+ ret = -1;
+ goto out;
+ }
+
+ if (!options->test) {
+ ret = full_write(out, uncompressed_data, actual_out_nbytes);
+ if (ret != 0)
+ goto out;
+ }
+
+ compressed_data += actual_in_nbytes;
+ compressed_size -= actual_in_nbytes;
+
+ } while (compressed_size != 0);
+out:
+ free(uncompressed_data);
+ return ret;
+}
+
+static int
+stat_file(struct file_stream *in, stat_t *stbuf, bool allow_hard_links)
+{
+ if (tfstat(in->fd, stbuf) != 0) {
+ msg("%"TS": unable to stat file", in->name);
+ return -1;
+ }
+
+ if (!S_ISREG(stbuf->st_mode) && !in->is_standard_stream) {
+ warn("%"TS" is %s -- skipping",
+ in->name, S_ISDIR(stbuf->st_mode) ? "a directory" :
+ "not a regular file");
+ return -2;
+ }
+
+ if (stbuf->st_nlink > 1 && !allow_hard_links) {
+ warn("%"TS" has multiple hard links -- skipping (use -f to process anyway)",
+ in->name);
+ return -2;
+ }
+
+ return 0;
+}
+
+static void
+restore_mode(struct file_stream *out, const stat_t *stbuf)
+{
+#ifndef _WIN32
+ if (fchmod(out->fd, stbuf->st_mode) != 0)
+ msg_errno("%"TS": unable to preserve mode", out->name);
+#endif
+}
+
+static void
+restore_owner_and_group(struct file_stream *out, const stat_t *stbuf)
+{
+#ifndef _WIN32
+ if (fchown(out->fd, stbuf->st_uid, stbuf->st_gid) != 0) {
+ msg_errno("%"TS": unable to preserve owner and group",
+ out->name);
+ }
+#endif
+}
+
+static void
+restore_timestamps(struct file_stream *out, const tchar *newpath,
+ const stat_t *stbuf)
+{
+ int ret;
+#ifdef __APPLE__
+ struct timespec times[2] = {
+ { stbuf->st_atime, stbuf->st_atimensec },
+ { stbuf->st_mtime, stbuf->st_mtimensec },
+ };
+ ret = futimens(out->fd, times);
+#elif defined(HAVE_FUTIMENS) && defined(HAVE_STAT_NANOSECOND_PRECISION)
+ struct timespec times[2] = {
+ stbuf->st_atim, stbuf->st_mtim,
+ };
+ ret = futimens(out->fd, times);
+#elif defined(HAVE_FUTIMES) && defined(HAVE_STAT_NANOSECOND_PRECISION)
+ struct timeval times[2] = {
+ { stbuf->st_atim.tv_sec, stbuf->st_atim.tv_nsec / 1000, },
+ { stbuf->st_mtim.tv_sec, stbuf->st_mtim.tv_nsec / 1000, },
+ };
+ ret = futimes(out->fd, times);
+#else
+ struct tutimbuf times = {
+ stbuf->st_atime, stbuf->st_mtime,
+ };
+	ret = tutime(newpath, &times);
+#endif
+ if (ret != 0)
+ msg_errno("%"TS": unable to preserve timestamps", out->name);
+}
+
+static void
+restore_metadata(struct file_stream *out, const tchar *newpath,
+ const stat_t *stbuf)
+{
+ restore_mode(out, stbuf);
+ restore_owner_and_group(out, stbuf);
+ restore_timestamps(out, newpath, stbuf);
+}
+
+static int
+decompress_file(struct libdeflate_decompressor *decompressor, const tchar *path,
+ const struct options *options)
+{
+ tchar *oldpath = (tchar *)path;
+ tchar *newpath = NULL;
+ struct file_stream in;
+ struct file_stream out;
+ stat_t stbuf;
+ int ret;
+ int ret2;
+
+ if (path != NULL) {
+ const tchar *suffix = get_suffix(path, options->suffix);
+ if (suffix == NULL) {
+ /*
+ * Input file is unsuffixed. If the file doesn't exist,
+ * then try it suffixed. Otherwise, if we're not
+ * writing to stdout, skip the file with warning status.
+ * Otherwise, go ahead and try to open the file anyway
+ * (which will very likely fail).
+ */
+ if (tstat(path, &stbuf) != 0 && errno == ENOENT) {
+ oldpath = append_suffix(path, options->suffix);
+ if (oldpath == NULL)
+ return -1;
+ if (!options->to_stdout)
+ newpath = (tchar *)path;
+ } else if (!options->to_stdout) {
+ warn("\"%"TS"\" does not end with the %"TS" suffix -- skipping",
+ path, options->suffix);
+ return -2;
+ }
+ } else if (!options->to_stdout) {
+ /*
+ * Input file is suffixed, and we're not writing to
+ * stdout. Strip the suffix to get the path to the
+ * output file.
+ */
+ newpath = xmalloc((suffix - oldpath + 1) *
+ sizeof(tchar));
+ if (newpath == NULL)
+ return -1;
+ tmemcpy(newpath, oldpath, suffix - oldpath);
+ newpath[suffix - oldpath] = '\0';
+ }
+ }
+
+ ret = xopen_for_read(oldpath, options->force || options->to_stdout,
+ &in);
+ if (ret != 0)
+ goto out_free_paths;
+
+ if (!options->force && isatty(in.fd)) {
+ msg("Refusing to read compressed data from terminal. "
+ "Use -f to override.\nFor help, use -h.");
+ ret = -1;
+ goto out_close_in;
+ }
+
+ ret = stat_file(&in, &stbuf, options->force || options->keep ||
+ oldpath == NULL || newpath == NULL);
+ if (ret != 0)
+ goto out_close_in;
+
+ ret = xopen_for_write(newpath, options->force, &out);
+ if (ret != 0)
+ goto out_close_in;
+
+ /* TODO: need a streaming-friendly solution */
+ ret = map_file_contents(&in, stbuf.st_size);
+ if (ret != 0)
+ goto out_close_out;
+
+ ret = do_decompress(decompressor, &in, &out, options);
+ if (ret != 0)
+ goto out_close_out;
+
+ if (oldpath != NULL && newpath != NULL)
+ restore_metadata(&out, newpath, &stbuf);
+ ret = 0;
+out_close_out:
+ ret2 = xclose(&out);
+ if (ret == 0)
+ ret = ret2;
+ if (ret != 0 && newpath != NULL)
+ tunlink(newpath);
+out_close_in:
+ xclose(&in);
+ if (ret == 0 && oldpath != NULL && newpath != NULL && !options->keep)
+ tunlink(oldpath);
+out_free_paths:
+ if (newpath != path)
+ free(newpath);
+ if (oldpath != path)
+ free(oldpath);
+ return ret;
+}
+
+static int
+compress_file(struct libdeflate_compressor *compressor, const tchar *path,
+ const struct options *options)
+{
+ tchar *newpath = NULL;
+ struct file_stream in;
+ struct file_stream out;
+ stat_t stbuf;
+ int ret;
+ int ret2;
+
+ if (path != NULL && !options->to_stdout) {
+ if (!options->force && has_suffix(path, options->suffix)) {
+ msg("%"TS": already has %"TS" suffix -- skipping",
+ path, options->suffix);
+ return 0;
+ }
+ newpath = append_suffix(path, options->suffix);
+ if (newpath == NULL)
+ return -1;
+ }
+
+ ret = xopen_for_read(path, options->force || options->to_stdout, &in);
+ if (ret != 0)
+ goto out_free_newpath;
+
+ ret = stat_file(&in, &stbuf, options->force || options->keep ||
+ path == NULL || newpath == NULL);
+ if (ret != 0)
+ goto out_close_in;
+
+ ret = xopen_for_write(newpath, options->force, &out);
+ if (ret != 0)
+ goto out_close_in;
+
+ if (!options->force && isatty(out.fd)) {
+ msg("Refusing to write compressed data to terminal. "
+ "Use -f to override.\nFor help, use -h.");
+ ret = -1;
+ goto out_close_out;
+ }
+
+ /* TODO: need a streaming-friendly solution */
+ ret = map_file_contents(&in, stbuf.st_size);
+ if (ret != 0)
+ goto out_close_out;
+
+ ret = do_compress(compressor, &in, &out);
+ if (ret != 0)
+ goto out_close_out;
+
+ if (path != NULL && newpath != NULL)
+ restore_metadata(&out, newpath, &stbuf);
+ ret = 0;
+out_close_out:
+ ret2 = xclose(&out);
+ if (ret == 0)
+ ret = ret2;
+ if (ret != 0 && newpath != NULL)
+ tunlink(newpath);
+out_close_in:
+ xclose(&in);
+ if (ret == 0 && path != NULL && newpath != NULL && !options->keep)
+ tunlink(path);
+out_free_newpath:
+ free(newpath);
+ return ret;
+}
+
+int
+tmain(int argc, tchar *argv[])
+{
+ tchar *default_file_list[] = { NULL };
+ struct options options;
+ int opt_char;
+ int i;
+ int ret;
+
+ begin_program(argv);
+
+ options.to_stdout = false;
+ options.decompress = is_gunzip();
+ options.force = false;
+ options.keep = false;
+ options.test = false;
+ options.compression_level = 6;
+ options.suffix = T(".gz");
+
+ while ((opt_char = tgetopt(argc, argv, optstring)) != -1) {
+ switch (opt_char) {
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ options.compression_level =
+ parse_compression_level(opt_char, toptarg);
+ if (options.compression_level < 0)
+ return 1;
+ break;
+ case 'c':
+ options.to_stdout = true;
+ break;
+ case 'd':
+ options.decompress = true;
+ break;
+ case 'f':
+ options.force = true;
+ break;
+ case 'h':
+ show_usage(stdout);
+ return 0;
+ case 'k':
+ options.keep = true;
+ break;
+ case 'n':
+ /*
+ * -n means don't save or restore the original filename
+ * in the gzip header. Currently this implementation
+ * already behaves this way by default, so accept the
+ * option as a no-op.
+ */
+ break;
+ case 'q':
+ suppress_warnings = true;
+ break;
+ case 'S':
+ options.suffix = toptarg;
+ if (options.suffix[0] == T('\0')) {
+ msg("invalid suffix");
+ return 1;
+ }
+ break;
+ case 't':
+ options.test = true;
+ options.decompress = true;
+ options.to_stdout = true;
+ /*
+ * -t behaves just like the more commonly used -c
+ * option, except that -t doesn't actually write
+ * anything. For ease of implementation, just pretend
+ * that -c was specified too.
+ */
+ break;
+ case 'V':
+ show_version();
+ return 0;
+ default:
+ show_usage(stderr);
+ return 1;
+ }
+ }
+
+ argv += toptind;
+ argc -= toptind;
+
+ if (argc == 0) {
+ argv = default_file_list;
+ argc = ARRAY_LEN(default_file_list);
+ } else {
+ for (i = 0; i < argc; i++)
+ if (argv[i][0] == '-' && argv[i][1] == '\0')
+ argv[i] = NULL;
+ }
+
+ ret = 0;
+ if (options.decompress) {
+ struct libdeflate_decompressor *d;
+
+ d = alloc_decompressor();
+ if (d == NULL)
+ return 1;
+
+ for (i = 0; i < argc; i++)
+ ret |= -decompress_file(d, argv[i], &options);
+
+ libdeflate_free_decompressor(d);
+ } else {
+ struct libdeflate_compressor *c;
+
+ c = alloc_compressor(options.compression_level);
+ if (c == NULL)
+ return 1;
+
+ for (i = 0; i < argc; i++)
+ ret |= -compress_file(c, argv[i], &options);
+
+ libdeflate_free_compressor(c);
+ }
+
+ switch (ret) {
+ case 0:
+ /* No warnings or errors */
+ return 0;
+ case 2:
+ /* At least one warning, but no errors */
+ if (suppress_warnings)
+ return 0;
+ return 2;
+ default:
+ /* At least one error */
+ return 1;
+ }
+}
diff --git a/tools/z64compress/src/enc/libdeflate/programs/prog_util.c b/tools/z64compress/src/enc/libdeflate/programs/prog_util.c
new file mode 100644
index 000000000..a4bf1c47d
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/programs/prog_util.c
@@ -0,0 +1,522 @@
+/*
+ * prog_util.c - utility functions for programs
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifdef __APPLE__
+/* for O_NOFOLLOW */
+# undef _POSIX_C_SOURCE
+# define _DARWIN_C_SOURCE
+#endif
+
+#include "prog_util.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdarg.h>
+#ifdef _WIN32
+# include <windows.h>
+#else
+# include <sys/mman.h>
+# include <sys/stat.h>
+#endif
+
+#ifndef O_BINARY
+# define O_BINARY 0
+#endif
+#ifndef O_SEQUENTIAL
+# define O_SEQUENTIAL 0
+#endif
+#ifndef O_NOFOLLOW
+# define O_NOFOLLOW 0
+#endif
+#ifndef O_NONBLOCK
+# define O_NONBLOCK 0
+#endif
+#ifndef O_NOCTTY
+# define O_NOCTTY 0
+#endif
+
+/* The invocation name of the program (filename component only) */
+const tchar *prog_invocation_name;
+
+/* Whether to suppress warning messages or not */
+bool suppress_warnings;
+
+static void
+do_msg(const char *format, bool with_errno, va_list va)
+{
+ int saved_errno = errno;
+
+ fprintf(stderr, "%"TS": ", prog_invocation_name);
+ vfprintf(stderr, format, va);
+ if (with_errno)
+ fprintf(stderr, ": %s\n", strerror(saved_errno));
+ else
+ fprintf(stderr, "\n");
+
+ errno = saved_errno;
+}
+
+/* Print a message to standard error */
+void
+msg(const char *format, ...)
+{
+ va_list va;
+
+ va_start(va, format);
+ do_msg(format, false, va);
+ va_end(va);
+}
+
+/* Print a message to standard error, including a description of errno */
+void
+msg_errno(const char *format, ...)
+{
+ va_list va;
+
+ va_start(va, format);
+ do_msg(format, true, va);
+ va_end(va);
+}
+
+
+/* Same as msg(), but do nothing if 'suppress_warnings' has been set. */
+void
+warn(const char *format, ...)
+{
+ if (!suppress_warnings) {
+ va_list va;
+
+ va_start(va, format);
+ do_msg(format, false, va);
+ va_end(va);
+ }
+}
+
+/* malloc() wrapper */
+void *
+xmalloc(size_t size)
+{
+ void *p = malloc(size);
+ if (p == NULL && size == 0)
+ p = malloc(1);
+ if (p == NULL)
+ msg("Out of memory");
+ return p;
+}
+
+/*
+ * Retrieve a pointer to the filename component of the specified path.
+ *
+ * Note: this does not modify the path. Therefore, it is not guaranteed to work
+ * properly for directories, since a path to a directory might have trailing
+ * slashes.
+ */
+static const tchar *
+get_filename(const tchar *path)
+{
+ const tchar *slash = tstrrchr(path, '/');
+#ifdef _WIN32
+ const tchar *backslash = tstrrchr(path, '\\');
+ if (backslash != NULL && (slash == NULL || backslash > slash))
+ slash = backslash;
+#endif
+ if (slash != NULL)
+ return slash + 1;
+ return path;
+}
+
+void
+begin_program(tchar *argv[])
+{
+ prog_invocation_name = get_filename(argv[0]);
+
+#ifdef FREESTANDING
+ /* This allows testing freestanding library builds. */
+ libdeflate_set_memory_allocator(malloc, free);
+#endif
+}
+
+/* Create a copy of 'path' surrounded by double quotes */
+static tchar *
+quote_path(const tchar *path)
+{
+ size_t len = tstrlen(path);
+ tchar *result;
+
+ result = xmalloc((1 + len + 1 + 1) * sizeof(tchar));
+ if (result == NULL)
+ return NULL;
+ result[0] = '"';
+ tmemcpy(&result[1], path, len);
+ result[1 + len] = '"';
+ result[1 + len + 1] = '\0';
+ return result;
+}
+
+/* Open a file for reading, or set up standard input for reading */
+int
+xopen_for_read(const tchar *path, bool symlink_ok, struct file_stream *strm)
+{
+ strm->mmap_token = NULL;
+ strm->mmap_mem = NULL;
+
+ if (path == NULL) {
+ strm->is_standard_stream = true;
+ strm->name = T("standard input");
+ strm->fd = STDIN_FILENO;
+ #ifdef _WIN32
+ _setmode(strm->fd, O_BINARY);
+ #endif
+ return 0;
+ }
+
+ strm->is_standard_stream = false;
+
+ strm->name = quote_path(path);
+ if (strm->name == NULL)
+ return -1;
+
+ strm->fd = topen(path, O_RDONLY | O_BINARY | O_NONBLOCK | O_NOCTTY |
+ (symlink_ok ? 0 : O_NOFOLLOW) | O_SEQUENTIAL);
+ if (strm->fd < 0) {
+ msg_errno("Can't open %"TS" for reading", strm->name);
+ free(strm->name);
+ return -1;
+ }
+
+#if defined(HAVE_POSIX_FADVISE) && (O_SEQUENTIAL == 0)
+ (void)posix_fadvise(strm->fd, 0, 0, POSIX_FADV_SEQUENTIAL);
+#endif
+
+ return 0;
+}
+
+/* Open a file for writing, or set up standard output for writing */
+int
+xopen_for_write(const tchar *path, bool overwrite, struct file_stream *strm)
+{
+ int ret = -1;
+
+ strm->mmap_token = NULL;
+ strm->mmap_mem = NULL;
+
+ if (path == NULL) {
+ strm->is_standard_stream = true;
+ strm->name = T("standard output");
+ strm->fd = STDOUT_FILENO;
+ #ifdef _WIN32
+ _setmode(strm->fd, O_BINARY);
+ #endif
+ return 0;
+ }
+
+ strm->is_standard_stream = false;
+
+ strm->name = quote_path(path);
+ if (strm->name == NULL)
+ goto err;
+retry:
+ strm->fd = topen(path, O_WRONLY | O_BINARY | O_NOFOLLOW |
+ O_CREAT | O_EXCL, 0644);
+ if (strm->fd < 0) {
+ if (errno != EEXIST) {
+ msg_errno("Can't open %"TS" for writing", strm->name);
+ goto err;
+ }
+ if (!overwrite) {
+ if (!isatty(STDERR_FILENO) || !isatty(STDIN_FILENO)) {
+ warn("%"TS" already exists; use -f to overwrite",
+ strm->name);
+ ret = -2; /* warning only */
+ goto err;
+ }
+ fprintf(stderr, "%"TS": %"TS" already exists; "
+ "overwrite? (y/n) ",
+ prog_invocation_name, strm->name);
+ if (getchar() != 'y') {
+ msg("Not overwriting.");
+ goto err;
+ }
+ }
+ if (tunlink(path) != 0) {
+ msg_errno("Unable to delete %"TS, strm->name);
+ goto err;
+ }
+ goto retry;
+ }
+
+ return 0;
+
+err:
+ free(strm->name);
+ return ret;
+}
+
+/* Read the full contents of a file into memory */
+static int
+read_full_contents(struct file_stream *strm)
+{
+ size_t filled = 0;
+ size_t capacity = 4096;
+ char *buf;
+ int ret;
+
+ buf = xmalloc(capacity);
+ if (buf == NULL)
+ return -1;
+ do {
+ if (filled == capacity) {
+ char *newbuf;
+
+ if (capacity == SIZE_MAX)
+ goto oom;
+ capacity += MIN(SIZE_MAX - capacity, capacity);
+ newbuf = realloc(buf, capacity);
+ if (newbuf == NULL)
+ goto oom;
+ buf = newbuf;
+ }
+ ret = xread(strm, &buf[filled], capacity - filled);
+ if (ret < 0)
+ goto err;
+ filled += ret;
+ } while (ret != 0);
+
+ strm->mmap_mem = buf;
+ strm->mmap_size = filled;
+ return 0;
+
+err:
+ free(buf);
+ return ret;
+oom:
+ msg("Out of memory! %"TS" is too large to be processed by "
+ "this program as currently implemented.", strm->name);
+ ret = -1;
+ goto err;
+}
+
+/* Map the contents of a file into memory */
+int
+map_file_contents(struct file_stream *strm, u64 size)
+{
+ if (size == 0) /* mmap isn't supported on empty files */
+ return read_full_contents(strm);
+
+ if (size > SIZE_MAX) {
+ msg("%"TS" is too large to be processed by this program",
+ strm->name);
+ return -1;
+ }
+#ifdef _WIN32
+ strm->mmap_token = CreateFileMapping(
+ (HANDLE)(intptr_t)_get_osfhandle(strm->fd),
+ NULL, PAGE_READONLY, 0, 0, NULL);
+ if (strm->mmap_token == NULL) {
+ DWORD err = GetLastError();
+ if (err == ERROR_BAD_EXE_FORMAT) /* mmap unsupported */
+ return read_full_contents(strm);
+ msg("Unable create file mapping for %"TS": Windows error %u",
+ strm->name, (unsigned int)err);
+ return -1;
+ }
+
+ strm->mmap_mem = MapViewOfFile((HANDLE)strm->mmap_token,
+ FILE_MAP_READ, 0, 0, size);
+ if (strm->mmap_mem == NULL) {
+ msg("Unable to map %"TS" into memory: Windows error %u",
+ strm->name, (unsigned int)GetLastError());
+ CloseHandle((HANDLE)strm->mmap_token);
+ return -1;
+ }
+#else /* _WIN32 */
+ strm->mmap_mem = mmap(NULL, size, PROT_READ, MAP_SHARED, strm->fd, 0);
+ if (strm->mmap_mem == MAP_FAILED) {
+ strm->mmap_mem = NULL;
+ if (errno == ENODEV /* standard */ ||
+ errno == EINVAL /* macOS */) {
+ /* mmap isn't supported on this file */
+ return read_full_contents(strm);
+ }
+ if (errno == ENOMEM) {
+ msg("%"TS" is too large to be processed by this "
+ "program", strm->name);
+ } else {
+ msg_errno("Unable to map %"TS" into memory",
+ strm->name);
+ }
+ return -1;
+ }
+
+#ifdef HAVE_POSIX_MADVISE
+ (void)posix_madvise(strm->mmap_mem, size, POSIX_MADV_SEQUENTIAL);
+#endif
+ strm->mmap_token = strm; /* anything that's not NULL */
+
+#endif /* !_WIN32 */
+ strm->mmap_size = size;
+ return 0;
+}
+
+/*
+ * Read from a file, returning the full count to indicate all bytes were read, a
+ * short count (possibly 0) to indicate EOF, or -1 to indicate error.
+ */
+ssize_t
+xread(struct file_stream *strm, void *buf, size_t count)
+{
+ char *p = buf;
+ size_t orig_count = count;
+
+ while (count != 0) {
+ ssize_t res = read(strm->fd, p, MIN(count, INT_MAX));
+ if (res == 0)
+ break;
+ if (res < 0) {
+ if (errno == EAGAIN || errno == EINTR)
+ continue;
+ msg_errno("Error reading from %"TS, strm->name);
+ return -1;
+ }
+ p += res;
+ count -= res;
+ }
+ return orig_count - count;
+}
+
+/* Write to a file, returning 0 if all bytes were written or -1 on error */
+int
+full_write(struct file_stream *strm, const void *buf, size_t count)
+{
+ const char *p = buf;
+
+ while (count != 0) {
+ ssize_t res = write(strm->fd, p, MIN(count, INT_MAX));
+ if (res <= 0) {
+ msg_errno("Error writing to %"TS, strm->name);
+ return -1;
+ }
+ p += res;
+ count -= res;
+ }
+ return 0;
+}
+
+/* Close a file, returning 0 on success or -1 on error */
+int
+xclose(struct file_stream *strm)
+{
+ int ret = 0;
+
+ if (!strm->is_standard_stream) {
+ if (close(strm->fd) != 0) {
+ msg_errno("Error closing %"TS, strm->name);
+ ret = -1;
+ }
+ free(strm->name);
+ }
+
+ if (strm->mmap_token != NULL) {
+#ifdef _WIN32
+ UnmapViewOfFile(strm->mmap_mem);
+ CloseHandle((HANDLE)strm->mmap_token);
+#else
+ munmap(strm->mmap_mem, strm->mmap_size);
+#endif
+ strm->mmap_token = NULL;
+ } else {
+ free(strm->mmap_mem);
+ }
+ strm->mmap_mem = NULL;
+ strm->fd = -1;
+ strm->name = NULL;
+ return ret;
+}
+
+/*
+ * Parse the compression level given on the command line, returning the
+ * compression level on success or -1 on error
+ */
+int
+parse_compression_level(tchar opt_char, const tchar *arg)
+{
+ int level;
+
+ if (arg == NULL)
+ arg = T("");
+
+ if (opt_char < '0' || opt_char > '9')
+ goto invalid;
+ level = opt_char - '0';
+
+ if (arg[0] != '\0') {
+ if (arg[0] < '0' || arg[0] > '9')
+ goto invalid;
+ if (arg[1] != '\0') /* Levels are at most 2 digits */
+ goto invalid;
+ if (level == 0) /* Don't allow arguments like "-01" */
+ goto invalid;
+ level = (level * 10) + (arg[0] - '0');
+ }
+
+ if (level < 0 || level > 12)
+ goto invalid;
+
+ return level;
+
+invalid:
+ msg("Invalid compression level: \"%"TC"%"TS"\". "
+ "Must be an integer in the range [0, 12].", opt_char, arg);
+ return -1;
+}
+
+/* Allocate a new DEFLATE compressor */
+struct libdeflate_compressor *
+alloc_compressor(int level)
+{
+ struct libdeflate_compressor *c;
+
+ c = libdeflate_alloc_compressor(level);
+ if (c == NULL) {
+ msg_errno("Unable to allocate compressor with "
+ "compression level %d", level);
+ }
+ return c;
+}
+
+/* Allocate a new DEFLATE decompressor */
+struct libdeflate_decompressor *
+alloc_decompressor(void)
+{
+ struct libdeflate_decompressor *d;
+
+ d = libdeflate_alloc_decompressor();
+ if (d == NULL)
+ msg_errno("Unable to allocate decompressor");
+
+ return d;
+}
diff --git a/tools/z64compress/src/enc/libdeflate/programs/prog_util.h b/tools/z64compress/src/enc/libdeflate/programs/prog_util.h
new file mode 100644
index 000000000..08f538399
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/programs/prog_util.h
@@ -0,0 +1,177 @@
+/*
+ * prog_util.h - utility functions for programs
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef PROGRAMS_PROG_UTIL_H
+#define PROGRAMS_PROG_UTIL_H
+
+/*
+ * To keep the code similar on all platforms, sometimes we intentionally use the
+ * "deprecated" non-underscore-prefixed variants of functions in msvcrt.
+ */
+#if defined(_WIN32) && !defined(_CRT_NONSTDC_NO_DEPRECATE)
+# define _CRT_NONSTDC_NO_DEPRECATE 1
+#endif
+/*
+ * Similarly, to match other platforms we intentionally use the "non-secure"
+ * variants, which aren't actually any less secure when used properly.
+ */
+#if defined(_WIN32) && !defined(_CRT_SECURE_NO_WARNINGS)
+# define _CRT_SECURE_NO_WARNINGS 1
+#endif
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include "libdeflate.h"
+
+#include <limits.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#ifndef _WIN32
+#  include <sys/types.h>
+#endif
+
+#include "../common_defs.h"
+
+#if defined(__GNUC__) || __has_attribute(format)
+# define _printf(str_idx, args_idx) \
+ __attribute__((format(printf, str_idx, args_idx)))
+#else
+# define _printf(str_idx, args_idx)
+#endif
+
+#ifdef _WIN32
+
+/*
+ * Definitions for Windows builds. Mainly, 'tchar' is defined to be the 2-byte
+ * 'wchar_t' type instead of 'char'. This is the only "easy" way I know of to
+ * get full Unicode support on Windows...
+ */
+
+#include <io.h>
+#include <wchar.h>
+int wmain(int argc, wchar_t **argv);
+# define tmain wmain
+# define tchar wchar_t
+# define _T(text) L##text
+# define T(text) _T(text)
+# define TS "ls"
+# define TC "lc"
+# define tmemcpy wmemcpy
+# define topen _wopen
+# define tstrchr wcschr
+# define tstrcmp wcscmp
+# define tstrlen wcslen
+# define tstrrchr wcsrchr
+# define tstrtoul wcstoul
+# define tstrxcmp wcsicmp
+# define tunlink _wunlink
+# define tutimbuf __utimbuf64
+# define tutime _wutime64
+# define tstat _wstat64
+# define tfstat _fstat64
+# define stat_t struct _stat64
+# ifdef _MSC_VER
+# define STDIN_FILENO 0
+# define STDOUT_FILENO 1
+# define STDERR_FILENO 2
+# define S_ISREG(m) (((m) & S_IFMT) == S_IFREG)
+# define S_ISDIR(m) (((m) & S_IFMT) == S_IFDIR)
+# endif
+
+#else /* _WIN32 */
+
+/* Standard definitions for everyone else */
+
+# define tmain main
+# define tchar char
+# define T(text) text
+# define TS "s"
+# define TC "c"
+# define tmemcpy memcpy
+# define topen open
+# define tstrchr strchr
+# define tstrcmp strcmp
+# define tstrlen strlen
+# define tstrrchr strrchr
+# define tstrtoul strtoul
+# define tstrxcmp strcmp
+# define tunlink unlink
+# define tutimbuf utimbuf
+# define tutime utime
+# define tstat stat
+# define tfstat fstat
+# define stat_t struct stat
+
+#endif /* !_WIN32 */
+
+extern const tchar *prog_invocation_name;
+extern bool suppress_warnings;
+
+void _printf(1, 2) msg(const char *fmt, ...);
+void _printf(1, 2) msg_errno(const char *fmt, ...);
+void _printf(1, 2) warn(const char *fmt, ...);
+
+void *xmalloc(size_t size);
+
+void begin_program(tchar *argv[]);
+
+struct file_stream {
+ int fd;
+ tchar *name;
+ bool is_standard_stream;
+ void *mmap_token;
+ void *mmap_mem;
+ size_t mmap_size;
+};
+
+int xopen_for_read(const tchar *path, bool symlink_ok,
+ struct file_stream *strm);
+int xopen_for_write(const tchar *path, bool force, struct file_stream *strm);
+int map_file_contents(struct file_stream *strm, u64 size);
+
+ssize_t xread(struct file_stream *strm, void *buf, size_t count);
+int full_write(struct file_stream *strm, const void *buf, size_t count);
+
+int xclose(struct file_stream *strm);
+
+int parse_compression_level(tchar opt_char, const tchar *arg);
+
+struct libdeflate_compressor *alloc_compressor(int level);
+struct libdeflate_decompressor *alloc_decompressor(void);
+
+/* tgetopt.c */
+
+extern tchar *toptarg;
+extern int toptind, topterr, toptopt;
+
+int tgetopt(int argc, tchar *argv[], const tchar *optstring);
+
+#endif /* PROGRAMS_PROG_UTIL_H */
diff --git a/tools/z64compress/src/enc/libdeflate/programs/test_checksums.c b/tools/z64compress/src/enc/libdeflate/programs/test_checksums.c
new file mode 100644
index 000000000..e66e62443
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/programs/test_checksums.c
@@ -0,0 +1,200 @@
+/*
+ * test_checksums.c
+ *
+ * Verify that libdeflate's Adler-32 and CRC-32 functions produce the same
+ * results as their zlib equivalents.
+ */
+
+#include <time.h>
+#include <zlib.h>
+
+#include "test_util.h"
+
+static unsigned int rng_seed;
+
+typedef u32 (*cksum_fn_t)(u32, const void *, size_t);
+
+static u32
+adler32_libdeflate(u32 adler, const void *buf, size_t len)
+{
+ return libdeflate_adler32(adler, buf, len);
+}
+
+static u32
+crc32_libdeflate(u32 crc, const void *buf, size_t len)
+{
+ return libdeflate_crc32(crc, buf, len);
+}
+
+static u32
+adler32_zlib(u32 adler, const void *buf, size_t len)
+{
+ return adler32(adler, buf, len);
+}
+
+static u32
+crc32_zlib(u32 crc, const void *buf, size_t len)
+{
+ return crc32(crc, buf, len);
+}
+
+static u32
+select_initial_crc(void)
+{
+ if (rand() & 1)
+ return 0;
+ return ((u32)rand() << 16) | rand();
+}
+
+static u32
+select_initial_adler(void)
+{
+ u32 lo, hi;
+
+ if (rand() & 1)
+ return 1;
+
+ lo = (rand() % 4 == 0 ? 65520 : rand() % 65521);
+ hi = (rand() % 4 == 0 ? 65520 : rand() % 65521);
+ return (hi << 16) | lo;
+}
+
+static void
+test_initial_values(cksum_fn_t cksum, u32 expected)
+{
+ ASSERT(cksum(0, NULL, 0) == expected);
+ if (cksum != adler32_zlib) /* broken */
+ ASSERT(cksum(0, NULL, 1) == expected);
+ ASSERT(cksum(0, NULL, 1234) == expected);
+ ASSERT(cksum(1234, NULL, 0) == expected);
+ ASSERT(cksum(1234, NULL, 1234) == expected);
+}
+
+static void
+test_multipart(const u8 *buffer, size_t size, const char *name,
+ cksum_fn_t cksum, u32 v, u32 expected)
+{
+ size_t division = rand() % (size + 1);
+ v = cksum(v, buffer, division);
+ v = cksum(v, buffer + division, size - division);
+ if (v != expected) {
+ fprintf(stderr, "%s checksum failed multipart test\n", name);
+ ASSERT(0);
+ }
+}
+
+static void
+test_checksums(const void *buffer, size_t size, const char *name,
+ cksum_fn_t cksum1, cksum_fn_t cksum2, u32 initial_value)
+{
+ u32 v1 = cksum1(initial_value, buffer, size);
+ u32 v2 = cksum2(initial_value, buffer, size);
+
+ if (v1 != v2) {
+ fprintf(stderr, "%s checksum mismatch\n", name);
+ fprintf(stderr, "initial_value=0x%08"PRIx32", buffer=%p, "
+ "size=%zu, buffer=", initial_value, buffer, size);
+ for (size_t i = 0; i < MIN(size, 256); i++)
+ fprintf(stderr, "%02x", ((const u8 *)buffer)[i]);
+ if (size > 256)
+ fprintf(stderr, "...");
+ fprintf(stderr, "\n");
+ ASSERT(0);
+ }
+
+ if ((rand() & 15) == 0) {
+ test_multipart(buffer, size, name, cksum1, initial_value, v1);
+ test_multipart(buffer, size, name, cksum2, initial_value, v1);
+ }
+}
+
+static void
+test_crc32(const void *buffer, size_t size, u32 initial_value)
+{
+ test_checksums(buffer, size, "CRC-32",
+ crc32_libdeflate, crc32_zlib, initial_value);
+}
+
+static void
+test_adler32(const void *buffer, size_t size, u32 initial_value)
+{
+ test_checksums(buffer, size, "Adler-32",
+ adler32_libdeflate, adler32_zlib, initial_value);
+}
+
+static void test_random_buffers(u8 *buf_start, u8 *buf_end, size_t limit,
+ u32 num_iter)
+{
+ for (u32 i = 0; i < num_iter; i++) {
+ size_t start = rand() % limit;
+ size_t len = rand() % (limit - start);
+ u32 a0 = select_initial_adler();
+ u32 c0 = select_initial_crc();
+
+ for (size_t j = start; j < start + len; j++)
+ buf_start[j] = rand();
+
+ /* Test with chosen size and alignment */
+ test_adler32(&buf_start[start], len, a0);
+ test_crc32(&buf_start[start], len, c0);
+
+ /* Test with chosen size, with guard page before input buffer */
+ memmove(buf_start, &buf_start[start], len);
+ test_adler32(buf_start, len, a0);
+ test_crc32(buf_start, len, c0);
+
+ /* Test with chosen size, with guard page after input buffer */
+ memmove(buf_end - len, buf_start, len);
+ test_adler32(buf_end - len, len, a0);
+ test_crc32(buf_end - len, len, c0);
+ }
+}
+
+int
+tmain(int argc, tchar *argv[])
+{
+ u8 *buf_start, *buf_end;
+
+ begin_program(argv);
+
+ alloc_guarded_buffer(262144, &buf_start, &buf_end);
+
+ rng_seed = time(NULL);
+ srand(rng_seed);
+
+ test_initial_values(adler32_libdeflate, 1);
+ test_initial_values(adler32_zlib, 1);
+ test_initial_values(crc32_libdeflate, 0);
+ test_initial_values(crc32_zlib, 0);
+
+ /* Test different buffer sizes and alignments */
+ test_random_buffers(buf_start, buf_end, 256, 5000);
+ test_random_buffers(buf_start, buf_end, 1024, 500);
+ test_random_buffers(buf_start, buf_end, 32768, 50);
+ test_random_buffers(buf_start, buf_end, 262144, 25);
+
+ /*
+ * Test Adler-32 overflow cases. For example, given all 0xFF bytes and
+ * the highest possible initial (s1, s2) of (65520, 65520), then s2 if
+ * stored as a 32-bit unsigned integer will overflow if > 5552 bytes are
+ * processed. Implementations must make sure to reduce s2 modulo 65521
+ * before that point. Also, some implementations make use of 16-bit
+ * counters which can overflow earlier.
+ */
+ memset(buf_start, 0xFF, 32768);
+ for (u32 i = 0; i < 20; i++) {
+ u32 initial_value;
+
+ if (i == 0)
+ initial_value = ((u32)65520 << 16) | 65520;
+ else
+ initial_value = select_initial_adler();
+
+ test_adler32(buf_start, 5553, initial_value);
+ test_adler32(buf_start, rand() % 32769, initial_value);
+ buf_start[rand() % 32768] = 0xFE;
+ }
+
+ free_guarded_buffer(buf_start, buf_end);
+ return 0;
+}
diff --git a/tools/z64compress/src/enc/libdeflate/programs/test_custom_malloc.c b/tools/z64compress/src/enc/libdeflate/programs/test_custom_malloc.c
new file mode 100644
index 000000000..2bbb7f098
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/programs/test_custom_malloc.c
@@ -0,0 +1,85 @@
+/*
+ * test_custom_malloc.c
+ *
+ * Test libdeflate_set_memory_allocator().
+ * Also test injecting allocation failures.
+ */
+
+#include "test_util.h"
+
+static int malloc_count = 0;
+static int free_count = 0;
+
+static void *do_malloc(size_t size)
+{
+ malloc_count++;
+ return malloc(size);
+}
+
+static void *do_fail_malloc(size_t size)
+{
+ malloc_count++;
+ return NULL;
+}
+
+static void do_free(void *ptr)
+{
+ free_count++;
+ free(ptr);
+}
+
+int
+tmain(int argc, tchar *argv[])
+{
+ int level;
+ struct libdeflate_compressor *c;
+ struct libdeflate_decompressor *d;
+
+ begin_program(argv);
+
+ /* Test that the custom allocator is actually used when requested. */
+
+ libdeflate_set_memory_allocator(do_malloc, do_free);
+ ASSERT(malloc_count == 0);
+ ASSERT(free_count == 0);
+
+ for (level = 0; level <= 12; level++) {
+ malloc_count = free_count = 0;
+ c = libdeflate_alloc_compressor(level);
+ ASSERT(c != NULL);
+ ASSERT(malloc_count == 1);
+ ASSERT(free_count == 0);
+ libdeflate_free_compressor(c);
+ ASSERT(malloc_count == 1);
+ ASSERT(free_count == 1);
+ }
+
+ malloc_count = free_count = 0;
+ d = libdeflate_alloc_decompressor();
+ ASSERT(d != NULL);
+ ASSERT(malloc_count == 1);
+ ASSERT(free_count == 0);
+ libdeflate_free_decompressor(d);
+ ASSERT(malloc_count == 1);
+ ASSERT(free_count == 1);
+
+ /* As long as we're here, also test injecting allocation failures. */
+
+ libdeflate_set_memory_allocator(do_fail_malloc, do_free);
+
+ for (level = 0; level <= 12; level++) {
+ malloc_count = free_count = 0;
+ c = libdeflate_alloc_compressor(level);
+ ASSERT(c == NULL);
+ ASSERT(malloc_count == 1);
+ ASSERT(free_count == 0);
+ }
+
+ malloc_count = free_count = 0;
+ d = libdeflate_alloc_decompressor();
+ ASSERT(d == NULL);
+ ASSERT(malloc_count == 1);
+ ASSERT(free_count == 0);
+
+ return 0;
+}
diff --git a/tools/z64compress/src/enc/libdeflate/programs/test_incomplete_codes.c b/tools/z64compress/src/enc/libdeflate/programs/test_incomplete_codes.c
new file mode 100644
index 000000000..4e441bccb
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/programs/test_incomplete_codes.c
@@ -0,0 +1,385 @@
+/*
+ * test_incomplete_codes.c
+ *
+ * Test that the decompressor accepts incomplete Huffman codes in certain
+ * specific cases.
+ */
+
+#include "test_util.h"
+
+static void
+verify_decompression_libdeflate(const u8 *in, size_t in_nbytes,
+ u8 *out, size_t out_nbytes_avail,
+ const u8 *expected_out,
+ size_t expected_out_nbytes)
+{
+ struct libdeflate_decompressor *d;
+ enum libdeflate_result res;
+ size_t actual_out_nbytes;
+
+ d = libdeflate_alloc_decompressor();
+ ASSERT(d != NULL);
+
+ res = libdeflate_deflate_decompress(d, in, in_nbytes,
+ out, out_nbytes_avail,
+ &actual_out_nbytes);
+ ASSERT(res == LIBDEFLATE_SUCCESS);
+ ASSERT(actual_out_nbytes == expected_out_nbytes);
+ ASSERT(memcmp(out, expected_out, actual_out_nbytes) == 0);
+
+ libdeflate_free_decompressor(d);
+}
+
+static void
+verify_decompression_zlib(const u8 *in, size_t in_nbytes,
+ u8 *out, size_t out_nbytes_avail,
+ const u8 *expected_out, size_t expected_out_nbytes)
+{
+ z_stream z;
+ int res;
+ size_t actual_out_nbytes;
+
+ memset(&z, 0, sizeof(z));
+ res = inflateInit2(&z, -15);
+ ASSERT(res == Z_OK);
+
+ z.next_in = (void *)in;
+ z.avail_in = in_nbytes;
+ z.next_out = (void *)out;
+ z.avail_out = out_nbytes_avail;
+ res = inflate(&z, Z_FINISH);
+ ASSERT(res == Z_STREAM_END);
+ actual_out_nbytes = out_nbytes_avail - z.avail_out;
+ ASSERT(actual_out_nbytes == expected_out_nbytes);
+ ASSERT(memcmp(out, expected_out, actual_out_nbytes) == 0);
+
+ inflateEnd(&z);
+}
+
+static void
+verify_decompression(const u8 *in, size_t in_nbytes,
+ u8 *out, size_t out_nbytes_avail,
+ const u8 *expected_out, size_t expected_out_nbytes)
+{
+ verify_decompression_libdeflate(in, in_nbytes, out, out_nbytes_avail,
+ expected_out, expected_out_nbytes);
+ verify_decompression_zlib(in, in_nbytes, out, out_nbytes_avail,
+ expected_out, expected_out_nbytes);
+
+}
+
+/* Test that an empty offset code is accepted. */
+static void
+test_empty_offset_code(void)
+{
+ static const u8 expected_out[] = { 'A', 'B', 'A', 'A' };
+ u8 in[128];
+ u8 out[128];
+ struct output_bitstream os = { .next = in, .end = in + sizeof(in) };
+ int i;
+
+ /*
+ * Generate a DEFLATE stream containing a "dynamic Huffman" block
+ * containing literals, but no offsets; and having an empty offset code
+ * (all codeword lengths set to 0).
+ *
+ * Litlen code:
+ * litlensym_A freq=3 len=1 codeword= 0
+ * litlensym_B freq=1 len=2 codeword=01
+ * litlensym_256 (end-of-block) freq=1 len=2 codeword=11
+ * Offset code:
+ * (empty)
+ *
+ * Litlen and offset codeword lengths:
+ * [0..'A'-1] = 0 presym_18
+ * ['A'] = 1 presym_1
+ * ['B'] = 2 presym_2
+ * ['B'+1..255] = 0 presym_18 presym_18
+ * [256] = 2 presym_2
+ * [257] = 0 presym_0
+ *
+ * Precode:
+ * presym_0 freq=1 len=3 codeword=011
+ * presym_1 freq=1 len=3 codeword=111
+ * presym_2 freq=2 len=2 codeword= 01
+ * presym_18 freq=3 len=1 codeword= 0
+ */
+
+ ASSERT(put_bits(&os, 1, 1)); /* BFINAL: 1 */
+ ASSERT(put_bits(&os, 2, 2)); /* BTYPE: DYNAMIC_HUFFMAN */
+ ASSERT(put_bits(&os, 0, 5)); /* num_litlen_syms: 0 + 257 */
+ ASSERT(put_bits(&os, 0, 5)); /* num_offset_syms: 0 + 1 */
+ ASSERT(put_bits(&os, 14, 4)); /* num_explicit_precode_lens: 14 + 4 */
+
+ /*
+ * Precode codeword lengths: order is
+ * [16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15]
+ */
+ for (i = 0; i < 2; i++) /* presym_{16,17}: len=0 */
+ ASSERT(put_bits(&os, 0, 3));
+ ASSERT(put_bits(&os, 1, 3)); /* presym_18: len=1 */
+ ASSERT(put_bits(&os, 3, 3)); /* presym_0: len=3 */
+ for (i = 0; i < 11; i++) /* presym_{8,...,13}: len=0 */
+ ASSERT(put_bits(&os, 0, 3));
+ ASSERT(put_bits(&os, 2, 3)); /* presym_2: len=2 */
+ ASSERT(put_bits(&os, 0, 3)); /* presym_14: len=0 */
+ ASSERT(put_bits(&os, 3, 3)); /* presym_1: len=3 */
+
+ /* Litlen and offset codeword lengths */
+ ASSERT(put_bits(&os, 0x0, 1) &&
+ put_bits(&os, 54, 7)); /* presym_18, 65 zeroes */
+ ASSERT(put_bits(&os, 0x7, 3)); /* presym_1 */
+ ASSERT(put_bits(&os, 0x1, 2)); /* presym_2 */
+ ASSERT(put_bits(&os, 0x0, 1) &&
+ put_bits(&os, 89, 7)); /* presym_18, 100 zeroes */
+ ASSERT(put_bits(&os, 0x0, 1) &&
+ put_bits(&os, 78, 7)); /* presym_18, 89 zeroes */
+ ASSERT(put_bits(&os, 0x1, 2)); /* presym_2 */
+ ASSERT(put_bits(&os, 0x3, 3)); /* presym_0 */
+
+ /* Litlen symbols */
+ ASSERT(put_bits(&os, 0x0, 1)); /* litlensym_A */
+ ASSERT(put_bits(&os, 0x1, 2)); /* litlensym_B */
+ ASSERT(put_bits(&os, 0x0, 1)); /* litlensym_A */
+ ASSERT(put_bits(&os, 0x0, 1)); /* litlensym_A */
+ ASSERT(put_bits(&os, 0x3, 2)); /* litlensym_256 (end-of-block) */
+
+ ASSERT(flush_bits(&os));
+
+ verify_decompression(in, os.next - in, out, sizeof(out),
+ expected_out, sizeof(expected_out));
+}
+
+/* Test that a litrunlen code containing only one symbol is accepted. */
+static void
+test_singleton_litrunlen_code(void)
+{
+ u8 in[128];
+ u8 out[128];
+ struct output_bitstream os = { .next = in, .end = in + sizeof(in) };
+ int i;
+
+ /*
+ * Litlen code:
+ * litlensym_256 (end-of-block) freq=1 len=1 codeword=0
+ * Offset code:
+ * (empty)
+ *
+ * Litlen and offset codeword lengths:
+ * [0..256] = 0 presym_18 presym_18
+ * [256] = 1 presym_1
+ * [257] = 0 presym_0
+ *
+ * Precode:
+ * presym_0 freq=1 len=2 codeword=01
+ * presym_1 freq=1 len=2 codeword=11
+ * presym_18 freq=2 len=1 codeword= 0
+ */
+
+ ASSERT(put_bits(&os, 1, 1)); /* BFINAL: 1 */
+ ASSERT(put_bits(&os, 2, 2)); /* BTYPE: DYNAMIC_HUFFMAN */
+ ASSERT(put_bits(&os, 0, 5)); /* num_litlen_syms: 0 + 257 */
+ ASSERT(put_bits(&os, 0, 5)); /* num_offset_syms: 0 + 1 */
+ ASSERT(put_bits(&os, 14, 4)); /* num_explicit_precode_lens: 14 + 4 */
+
+ /*
+ * Precode codeword lengths: order is
+ * [16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15]
+ */
+ for (i = 0; i < 2; i++) /* presym_{16,17}: len=0 */
+ ASSERT(put_bits(&os, 0, 3));
+ ASSERT(put_bits(&os, 1, 3)); /* presym_18: len=1 */
+ ASSERT(put_bits(&os, 2, 3)); /* presym_0: len=2 */
+ for (i = 0; i < 13; i++) /* presym_{8,...,14}: len=0 */
+ ASSERT(put_bits(&os, 0, 3));
+ ASSERT(put_bits(&os, 2, 3)); /* presym_1: len=2 */
+
+ /* Litlen and offset codeword lengths */
+ for (i = 0; i < 2; i++) {
+ ASSERT(put_bits(&os, 0, 1) && /* presym_18, 128 zeroes */
+ put_bits(&os, 117, 7));
+ }
+ ASSERT(put_bits(&os, 0x3, 2)); /* presym_1 */
+ ASSERT(put_bits(&os, 0x1, 2)); /* presym_0 */
+
+ /* Litlen symbols */
+ ASSERT(put_bits(&os, 0x0, 1)); /* litlensym_256 (end-of-block) */
+
+ ASSERT(flush_bits(&os));
+
+ verify_decompression(in, os.next - in, out, sizeof(out), in, 0);
+}
+
+/* Test that an offset code containing only one symbol is accepted. */
+static void
+test_singleton_offset_code(void)
+{
+ static const u8 expected_out[] = { 255, 255, 255, 255 };
+ u8 in[128];
+ u8 out[128];
+ struct output_bitstream os = { .next = in, .end = in + sizeof(in) };
+ int i;
+
+ ASSERT(put_bits(&os, 1, 1)); /* BFINAL: 1 */
+ ASSERT(put_bits(&os, 2, 2)); /* BTYPE: DYNAMIC_HUFFMAN */
+
+ /*
+ * Litlen code:
+ * litlensym_255 freq=1 len=1 codeword= 0
+ * litlensym_256 (end-of-block) freq=1 len=2 codeword=01
+ * litlensym_257 (len 3) freq=1 len=2 codeword=11
+ * Offset code:
+ * offsetsym_0 (offset 0) freq=1 len=1 codeword=0
+ *
+ * Litlen and offset codeword lengths:
+ * [0..254] = 0 presym_{18,18}
+ * [255] = 1 presym_1
+ * [256] = 1 presym_2
+ * [257] = 1 presym_2
+ * [258] = 1 presym_1
+ *
+ * Precode:
+ * presym_1 freq=2 len=2 codeword=01
+ * presym_2 freq=2 len=2 codeword=11
+ * presym_18 freq=2 len=1 codeword= 0
+ */
+
+ ASSERT(put_bits(&os, 1, 5)); /* num_litlen_syms: 1 + 257 */
+ ASSERT(put_bits(&os, 0, 5)); /* num_offset_syms: 0 + 1 */
+ ASSERT(put_bits(&os, 14, 4)); /* num_explicit_precode_lens: 14 + 4 */
+ /*
+ * Precode codeword lengths: order is
+ * [16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15]
+ */
+ for (i = 0; i < 2; i++) /* presym_{16,17}: len=0 */
+ ASSERT(put_bits(&os, 0, 3));
+ ASSERT(put_bits(&os, 1, 3)); /* presym_18: len=1 */
+ for (i = 0; i < 12; i++) /* presym_{0,...,13}: len=0 */
+ ASSERT(put_bits(&os, 0, 3));
+ ASSERT(put_bits(&os, 2, 3)); /* presym_2: len=2 */
+ ASSERT(put_bits(&os, 0, 3)); /* presym_14: len=0 */
+ ASSERT(put_bits(&os, 2, 3)); /* presym_1: len=2 */
+
+ /* Litlen and offset codeword lengths */
+ ASSERT(put_bits(&os, 0x0, 1) && /* presym_18, 128 zeroes */
+ put_bits(&os, 117, 7));
+ ASSERT(put_bits(&os, 0x0, 1) && /* presym_18, 127 zeroes */
+ put_bits(&os, 116, 7));
+ ASSERT(put_bits(&os, 0x1, 2)); /* presym_1 */
+ ASSERT(put_bits(&os, 0x3, 2)); /* presym_2 */
+ ASSERT(put_bits(&os, 0x3, 2)); /* presym_2 */
+ ASSERT(put_bits(&os, 0x1, 2)); /* presym_1 */
+
+ /* Literal */
+ ASSERT(put_bits(&os, 0x0, 1)); /* litlensym_255 */
+
+ /* Match */
+ ASSERT(put_bits(&os, 0x3, 2)); /* litlensym_257 */
+ ASSERT(put_bits(&os, 0x0, 1)); /* offsetsym_0 */
+
+ /* End of block */
+ ASSERT(put_bits(&os, 0x1, 2)); /* litlensym_256 */
+
+ ASSERT(flush_bits(&os));
+
+ verify_decompression(in, os.next - in, out, sizeof(out),
+ expected_out, sizeof(expected_out));
+}
+
+/* Test that an offset code containing only one symbol is accepted, even if that
+ * symbol is not symbol 0. The codeword should be '0' in either case. */
+static void
+test_singleton_offset_code_notsymzero(void)
+{
+ static const u8 expected_out[] = { 254, 255, 254, 255, 254 };
+ u8 in[128];
+ u8 out[128];
+ struct output_bitstream os = { .next = in, .end = in + sizeof(in) };
+ int i;
+
+ ASSERT(put_bits(&os, 1, 1)); /* BFINAL: 1 */
+ ASSERT(put_bits(&os, 2, 2)); /* BTYPE: DYNAMIC_HUFFMAN */
+
+ /*
+ * Litlen code:
+ * litlensym_254 len=2 codeword=00
+ * litlensym_255 len=2 codeword=10
+ * litlensym_256 (end-of-block) len=2 codeword=01
+ * litlensym_257 (len 3) len=2 codeword=11
+ * Offset code:
+ * offsetsym_1 (offset 2) len=1 codeword=0
+ *
+ * Litlen and offset codeword lengths:
+ * [0..253] = 0 presym_{18,18}
+ * [254] = 2 presym_2
+ * [255] = 2 presym_2
+ * [256] = 2 presym_2
+ * [257] = 2 presym_2
+ * [258] = 0 presym_0
+ * [259] = 1 presym_1
+ *
+ * Precode:
+ * presym_0 len=2 codeword=00
+ * presym_1 len=2 codeword=10
+ * presym_2 len=2 codeword=01
+ * presym_18 len=2 codeword=11
+ */
+
+ ASSERT(put_bits(&os, 1, 5)); /* num_litlen_syms: 1 + 257 */
+ ASSERT(put_bits(&os, 1, 5)); /* num_offset_syms: 1 + 1 */
+ ASSERT(put_bits(&os, 14, 4)); /* num_explicit_precode_lens: 14 + 4 */
+ /*
+ * Precode codeword lengths: order is
+ * [16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15]
+ */
+ for (i = 0; i < 2; i++) /* presym_{16,17}: len=0 */
+ ASSERT(put_bits(&os, 0, 3));
+ ASSERT(put_bits(&os, 2, 3)); /* presym_18: len=2 */
+ ASSERT(put_bits(&os, 2, 3)); /* presym_0: len=2 */
+ for (i = 0; i < 11; i++) /* presym_{8,...,13}: len=0 */
+ ASSERT(put_bits(&os, 0, 3));
+ ASSERT(put_bits(&os, 2, 3)); /* presym_2: len=2 */
+ ASSERT(put_bits(&os, 0, 3)); /* presym_14: len=0 */
+ ASSERT(put_bits(&os, 2, 3)); /* presym_1: len=2 */
+
+ /* Litlen and offset codeword lengths */
+ ASSERT(put_bits(&os, 0x3, 2) && /* presym_18, 128 zeroes */
+ put_bits(&os, 117, 7));
+ ASSERT(put_bits(&os, 0x3, 2) && /* presym_18, 126 zeroes */
+ put_bits(&os, 115, 7));
+ ASSERT(put_bits(&os, 0x1, 2)); /* presym_2 */
+ ASSERT(put_bits(&os, 0x1, 2)); /* presym_2 */
+ ASSERT(put_bits(&os, 0x1, 2)); /* presym_2 */
+ ASSERT(put_bits(&os, 0x1, 2)); /* presym_2 */
+ ASSERT(put_bits(&os, 0x0, 2)); /* presym_0 */
+ ASSERT(put_bits(&os, 0x2, 2)); /* presym_1 */
+
+ /* Literals */
+ ASSERT(put_bits(&os, 0x0, 2)); /* litlensym_254 */
+ ASSERT(put_bits(&os, 0x2, 2)); /* litlensym_255 */
+
+ /* Match */
+ ASSERT(put_bits(&os, 0x3, 2)); /* litlensym_257 */
+ ASSERT(put_bits(&os, 0x0, 1)); /* offsetsym_1 */
+
+ /* End of block */
+ ASSERT(put_bits(&os, 0x1, 2)); /* litlensym_256 */
+
+ ASSERT(flush_bits(&os));
+
+ verify_decompression(in, os.next - in, out, sizeof(out),
+ expected_out, sizeof(expected_out));
+}
+
+int
+tmain(int argc, tchar *argv[])
+{
+ begin_program(argv);
+
+ test_empty_offset_code();
+ test_singleton_litrunlen_code();
+ test_singleton_offset_code();
+ test_singleton_offset_code_notsymzero();
+
+ return 0;
+}
diff --git a/tools/z64compress/src/enc/libdeflate/programs/test_litrunlen_overflow.c b/tools/z64compress/src/enc/libdeflate/programs/test_litrunlen_overflow.c
new file mode 100644
index 000000000..cdec8c802
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/programs/test_litrunlen_overflow.c
@@ -0,0 +1,72 @@
+/*
+ * test_litrunlen_overflow.c
+ *
+ * Regression test for commit f2f0df727444 ("deflate_compress: fix corruption
+ * with long literal run"). Try to compress a file longer than 65535 bytes
+ * where no 2-byte sequence (3 would be sufficient) is repeated <= 32768 bytes
+ * apart, and the distribution of bytes remains constant throughout, and yet not
+ * all bytes are used so the data is still slightly compressible. There will be
+ * no matches in this data, but the compressor should still output a compressed
+ * block, and this block should contain more than 65535 consecutive literals,
+ * which triggered the bug.
+ *
+ * Note: on random data, this situation is extremely unlikely if the compressor
+ * uses all matches it finds, since random data will on average have a 3-byte
+ * match every (256**3)/32768 = 512 bytes.
+ */
+
+#include "test_util.h"
+
+int
+tmain(int argc, tchar *argv[])
+{
+ const int data_size = 2 * 250 * 251;
+ u8 *orig_data, *compressed_data, *decompressed_data;
+ int i, stride, multiple, j = 0;
+ struct libdeflate_decompressor *d;
+ static const int levels[] = { 3, 6, 12 };
+
+ begin_program(argv);
+
+ orig_data = xmalloc(data_size);
+ compressed_data = xmalloc(data_size);
+ decompressed_data = xmalloc(data_size);
+
+ for (i = 0; i < 2; i++) {
+ for (stride = 1; stride < 251; stride++) {
+ for (multiple = 0; multiple < 251; multiple++)
+ orig_data[j++] = (stride * multiple) % 251;
+ }
+ }
+ ASSERT(j == data_size);
+
+ d = libdeflate_alloc_decompressor();
+ ASSERT(d != NULL);
+
+ for (i = 0; i < ARRAY_LEN(levels); i++) {
+ struct libdeflate_compressor *c;
+ size_t csize;
+ enum libdeflate_result res;
+
+ c = libdeflate_alloc_compressor(levels[i]);
+ ASSERT(c != NULL);
+
+ csize = libdeflate_deflate_compress(c, orig_data, data_size,
+ compressed_data, data_size);
+ ASSERT(csize > 0 && csize < data_size);
+
+ res = libdeflate_deflate_decompress(d, compressed_data, csize,
+ decompressed_data,
+ data_size, NULL);
+ ASSERT(res == LIBDEFLATE_SUCCESS);
+ ASSERT(memcmp(orig_data, decompressed_data, data_size) == 0);
+
+ libdeflate_free_compressor(c);
+ }
+
+ libdeflate_free_decompressor(d);
+ free(orig_data);
+ free(compressed_data);
+ free(decompressed_data);
+ return 0;
+}
diff --git a/tools/z64compress/src/enc/libdeflate/programs/test_overread.c b/tools/z64compress/src/enc/libdeflate/programs/test_overread.c
new file mode 100644
index 000000000..2a6003218
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/programs/test_overread.c
@@ -0,0 +1,95 @@
+/*
+ * test_overread.c
+ *
+ * Test that the decompressor doesn't produce an unbounded amount of output if
+ * it runs out of input, even when implicit zeroes appended to the input would
+ * continue producing output (as is the case when the input ends during a
+ * DYNAMIC_HUFFMAN block where a literal has an all-zeroes codeword).
+ *
+ * This is a regression test for commit 3f21ec9d6121 ("deflate_decompress: error
+ * out if overread count gets too large").
+ */
+
+#include "test_util.h"
+
+static void
+generate_test_input(struct output_bitstream *os)
+{
+ int i;
+
+ put_bits(os, 0, 1); /* BFINAL: 0 */
+ put_bits(os, 2, 2); /* BTYPE: DYNAMIC_HUFFMAN */
+
+ /*
+ * Write the Huffman codes.
+ *
+ * Litlen code:
+ * litlensym_0 (0) len=1 codeword=0
+ * litlensym_256 (end-of-block) len=1 codeword=1
+ * Offset code:
+ * offsetsym_0 (unused) len=1 codeword=0
+ *
+ * Litlen and offset codeword lengths:
+ * [0] = 1 presym_1
+ * [1..255] = 0 presym_{18,18}
+ * [256] = 1 presym_1
+ * [257] = 1 presym_1
+ *
+ * Precode:
+ * presym_1 len=1 codeword=0
+ * presym_18 len=1 codeword=1
+ */
+ put_bits(os, 0, 5); /* num_litlen_syms: 0 + 257 */
+ put_bits(os, 0, 5); /* num_offset_syms: 0 + 1 */
+ put_bits(os, 14, 4); /* num_explicit_precode_lens: 14 + 4 */
+ /*
+ * Precode codeword lengths: order is
+ * [16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15]
+ */
+ put_bits(os, 0, 3); /* presym_16: len=0 */
+ put_bits(os, 0, 3); /* presym_17: len=0 */
+ put_bits(os, 1, 3); /* presym_18: len=1 */
+ for (i = 0; i < 14; i++) /* presym_{0,...,14}: len=0 */
+ put_bits(os, 0, 3);
+ put_bits(os, 1, 3); /* presym_1: len=1 */
+
+ /* Litlen and offset codeword lengths */
+ put_bits(os, 0, 1); /* presym_1 */
+ put_bits(os, 1, 1); /* presym_18 ... */
+ put_bits(os, 117, 7); /* ... 11 + 117 zeroes */
+ put_bits(os, 1, 1); /* presym_18 ... */
+ put_bits(os, 116, 7); /* ... 11 + 116 zeroes */
+ put_bits(os, 0, 1); /* presym_1 */
+ put_bits(os, 0, 1); /* presym_1 */
+
+ /* Implicit zeroes would generate endless literals from here. */
+
+ ASSERT(flush_bits(os));
+}
+
+int
+tmain(int argc, tchar *argv[])
+{
+ u8 cdata[16];
+ u8 udata[256];
+ struct output_bitstream os =
+ { .next = cdata, .end = cdata + sizeof(cdata) };
+ struct libdeflate_decompressor *d;
+ enum libdeflate_result res;
+ size_t actual_out_nbytes;
+
+ begin_program(argv);
+
+ generate_test_input(&os);
+ d = libdeflate_alloc_decompressor();
+ ASSERT(d != NULL);
+
+ res = libdeflate_deflate_decompress(d, cdata, os.next - cdata,
+ udata, sizeof(udata),
+ &actual_out_nbytes);
+ /* Before the fix, the result was LIBDEFLATE_INSUFFICIENT_SPACE here. */
+ ASSERT(res == LIBDEFLATE_BAD_DATA);
+
+ libdeflate_free_decompressor(d);
+ return 0;
+}
diff --git a/tools/z64compress/src/enc/libdeflate/programs/test_slow_decompression.c b/tools/z64compress/src/enc/libdeflate/programs/test_slow_decompression.c
new file mode 100644
index 000000000..d5ac26245
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/programs/test_slow_decompression.c
@@ -0,0 +1,472 @@
+/*
+ * test_slow_decompression.c
+ *
+ * Test how quickly libdeflate decompresses degenerate/malicious compressed data
+ * streams that start new Huffman blocks extremely frequently.
+ */
+
+#include "test_util.h"
+
+/*
+ * Generate a DEFLATE stream containing all empty "static Huffman" blocks.
+ *
+ * libdeflate used to decompress this very slowly (~1000x slower than typical
+ * data), but now it's much faster (only ~2x slower than typical data) because
+ * now it skips rebuilding the decode tables for the static Huffman codes when
+ * they're already loaded into the decompressor.
+ */
+static void
+generate_empty_static_huffman_blocks(u8 *p, size_t len)
+{
+ struct output_bitstream os = { .next = p, .end = p + len };
+
+ while (put_bits(&os, 0, 1) && /* BFINAL: 0 */
+ put_bits(&os, 1, 2) && /* BTYPE: STATIC_HUFFMAN */
+ put_bits(&os, 0, 7)) /* litlensym_256 (end-of-block) */
+ ;
+}
+
+static bool
+generate_empty_dynamic_huffman_block(struct output_bitstream *os)
+{
+ int i;
+
+ if (!put_bits(os, 0, 1)) /* BFINAL: 0 */
+ return false;
+ if (!put_bits(os, 2, 2)) /* BTYPE: DYNAMIC_HUFFMAN */
+ return false;
+
+ /*
+ * Write a minimal Huffman code, then the end-of-block symbol.
+ *
+ * Litlen code:
+ * litlensym_256 (end-of-block) freq=1 len=1 codeword=0
+ * Offset code:
+ * offsetsym_0 (unused) freq=0 len=1 codeword=0
+ *
+ * Litlen and offset codeword lengths:
+ * [0..255] = 0 presym_{18,18}
+ * [256] = 1 presym_1
+ * [257] = 1 presym_1
+ *
+ * Precode:
+ * presym_1 freq=2 len=1 codeword=0
+ * presym_18 freq=2 len=1 codeword=1
+ */
+
+ if (!put_bits(os, 0, 5)) /* num_litlen_syms: 0 + 257 */
+ return false;
+ if (!put_bits(os, 0, 5)) /* num_offset_syms: 0 + 1 */
+ return false;
+ if (!put_bits(os, 14, 4)) /* num_explicit_precode_lens: 14 + 4 */
+ return false;
+ /*
+ * Precode codeword lengths: order is
+ * [16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15]
+ */
+ for (i = 0; i < 2; i++) { /* presym_{16,17}: len=0 */
+ if (!put_bits(os, 0, 3))
+ return false;
+ }
+ if (!put_bits(os, 1, 3)) /* presym_18: len=1 */
+ return false;
+ for (i = 0; i < 14; i++) { /* presym_{0,...,14}: len=0 */
+ if (!put_bits(os, 0, 3))
+ return false;
+ }
+ if (!put_bits(os, 1, 3)) /* presym_1: len=1 */
+ return false;
+
+ /* Litlen and offset codeword lengths */
+ for (i = 0; i < 2; i++) {
+ if (!put_bits(os, 1, 1) || /* presym_18, 128 zeroes */
+ !put_bits(os, 117, 7))
+ return false;
+ }
+ if (!put_bits(os, 0, 1)) /* presym_1 */
+ return false;
+ if (!put_bits(os, 0, 1)) /* presym_1 */
+ return false;
+ /* Done writing the Huffman codes */
+
+ return put_bits(os, 0, 1); /* litlensym_256 (end-of-block) */
+}
+
+/*
+ * Generate a DEFLATE stream containing all empty "dynamic Huffman" blocks.
+ *
+ * This is the worst known case currently, being ~100x slower to decompress than
+ * typical data.
+ */
+static void
+generate_empty_dynamic_huffman_blocks(u8 *p, size_t len)
+{
+ struct output_bitstream os = { .next = p, .end = p + len };
+
+ while (generate_empty_dynamic_huffman_block(&os))
+ ;
+}
+
+#define NUM_ITERATIONS 100
+
+static u64
+do_test_libdeflate(const char *input_type, const u8 *in, size_t in_nbytes,
+ u8 *out, size_t out_nbytes_avail)
+{
+ struct libdeflate_decompressor *d;
+ enum libdeflate_result res;
+ u64 t;
+ int i;
+
+ d = libdeflate_alloc_decompressor();
+ ASSERT(d != NULL);
+
+ t = timer_ticks();
+ for (i = 0; i < NUM_ITERATIONS; i++) {
+ res = libdeflate_deflate_decompress(d, in, in_nbytes, out,
+ out_nbytes_avail, NULL);
+ ASSERT(res == LIBDEFLATE_BAD_DATA ||
+ res == LIBDEFLATE_INSUFFICIENT_SPACE);
+ }
+ t = timer_ticks() - t;
+
+ printf("[%s, libdeflate]: %"PRIu64" KB/s\n", input_type,
+ timer_KB_per_s((u64)in_nbytes * NUM_ITERATIONS, t));
+
+ libdeflate_free_decompressor(d);
+ return t;
+}
+
+static u64
+do_test_zlib(const char *input_type, const u8 *in, size_t in_nbytes,
+ u8 *out, size_t out_nbytes_avail)
+{
+ z_stream z;
+ int res;
+ u64 t;
+ int i;
+
+ memset(&z, 0, sizeof(z));
+ res = inflateInit2(&z, -15);
+ ASSERT(res == Z_OK);
+
+ t = timer_ticks();
+ for (i = 0; i < NUM_ITERATIONS; i++) {
+ inflateReset(&z);
+ z.next_in = (void *)in;
+ z.avail_in = in_nbytes;
+ z.next_out = out;
+ z.avail_out = out_nbytes_avail;
+ res = inflate(&z, Z_FINISH);
+ ASSERT(res == Z_BUF_ERROR || res == Z_DATA_ERROR);
+ }
+ t = timer_ticks() - t;
+
+ printf("[%s, zlib ]: %"PRIu64" KB/s\n", input_type,
+ timer_KB_per_s((u64)in_nbytes * NUM_ITERATIONS, t));
+
+ inflateEnd(&z);
+ return t;
+}
+
+/*
+ * Test case from https://github.com/ebiggers/libdeflate/issues/33
+ * with the gzip header and footer removed to leave just the DEFLATE stream
+ */
+static const u8 orig_repro[3962] =
+ "\xea\x04\x48\x00\x20\x80\x28\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x6a\x6a\x6a\x6a\x6a\x6a\x6a\x6a\x6a\x6a\x6a\x6a"
+ "\x6a\x6a\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x92\x63\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x92\x63\x00\xea\x04\x48\x00\x20"
+ "\x80\x28\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x00\xea\x04\x48\x00\x20\x80\x28"
+ "\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x92\x63\x00\xea\x04\x48\x00\x20\x80\x28\x00\x00\x11"
+ "\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x00\xea\x04\x48\x00\x20\x80\x28\x00\x00\x11\x00\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x00\xea\x04\x48"
+ "\x00\x20\x80\x28\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x92\x63\x00\xea\x04\x48\x00\x20\x80"
+ "\x28\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x00\xea\x04\x48\x00\x20\x80\x28\x00"
+ "\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x92\x63\x00\xea"
+ "\x04\x48\x00\x20\x80\x28\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x92\x63\x00\xea"
+ "\x04\x48\x00\x20\x80\x28\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x92\x63\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x92\x63\x00\xea\x04\x48"
+ "\x00\x20\x80\x28\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x00\xea\x04\x48\x00\x20"
+ "\x80\x28\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x92\x63\x00\xea\x04\x48\x00\x20\x80\x28\x00"
+ "\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x00\xea\x04\x48\x00\x20\x80\x28\x00\x00\x11"
+ "\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x63"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x92"
+ "\x63\x00\xea\x04\x48\x00\x20\x80\x28\x00\x00\x11\x00\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x00"
+ "\xea\x04\x48\x00\x20\x80\x28\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x92\x63\x00\xea\x04\x48"
+ "\x00\x20\x80\x28\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x00\xea\x04\x48\x00\x20"
+ "\x80\x28\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x92\x63\x00\xea\x04\x48\x00\x20\x80\x28\x00"
+ "\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x92\x63\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x92\x63\x00\xea"
+ "\x04\x48\x00\x20\x80\x28\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x00\xea\x04\x48"
+ "\x00\x20\x80\x28\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x00\xea\x04\x48\x00\x20\x80\x28\x00\x00\x11"
+ "\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x92\x63\x00\xea\x04\x48\x00\x20\x80\x28\x00\x00\x11\x00\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x00\xea\x04\x48\x00\x20\x80\x28\x00\x00\x11\x00\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x92\x63\x00\xea\x04\x48\x00\x20\x80\x28\x00"
+ "\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x92\x63\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x92\x63\x00\xea\x04\x48\x00\x20\x80\x28\x00\x00\x11"
+ "\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x00\xea\x04\x48\x00\x20\x80\x28\x00\x00\x11\x00\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x92\x63"
+ "\x00\xea\x04\x48\x00\x20\x80\x28\x00\x00\x11\x00\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x00\xea"
+ "\x04\x48\x00\x20\x80\x28\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x92\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x6a\x6a\x6a\x6a\x6a\x6a\x6a\x6a\x6a\x6a\x6a\x6a\x6a"
+ "\x6a\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x92\x63\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x92\x63\x00\xea\x04\x48\x00\x20\x80"
+ "\x28\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x00\xea\x04\x48\x00\x20\x80\x28\x00"
+ "\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x92\x63\x00\xea\x04\x48\x00\x20\x80\x28\x00"
+ "\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x92\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x6a\x6a\x6a\x6a\x6a\x6a\x6a\x6a\x6a\x6a\x6a"
+ "\x6a\x6a\x6a\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x92\x63\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x92\x63\x00\xea\x04\x48\x00"
+ "\x20\x80\x28\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x00\xea\x04\x48\x00\x20\x80"
+ "\x28\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x92\x63\x00\xea\x04\x48\x00\x20\x80\x28\x00\x00"
+ "\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x00\xea\x04\x48\x00\x20\x80\x28\x00\x00\x11\x00"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x00\xea\x04"
+ "\x48\x00\x20\x80\x28\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x92\x63\x00\xea\x04\x48\x00\x20"
+ "\x80\x28\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x00\xea\x04\x48\x00\x20\x80\x28"
+ "\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x92\x63\x00"
+ "\xea\x04\x48\x00\x20\x80\x28\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x92\x63\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x92\x63\x00\xea\x04"
+ "\x48\x00\x20\x80\x28\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x00\xea\x04\x48\x00"
+ "\x20\x80\x28\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x92\x63\x00\xea\x04\x48\x00\x20\x80\x28"
+ "\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x00\xea\x04\x48\x00\x20\x80\x28\x00\x00"
+ "\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x63\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x92\x63\x00\xea\x04\x48\x00\x20\x80\x28\x00\x00\x11\x00\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x00\xea\x04\x48\x00\x20\x80\x28\x00\x00\x11\x00\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x92\x63\x00\xea\x04"
+ "\x48\x00\x20\x80\x28\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x00\xea\x04\x48\x00"
+ "\x20\x80\x28\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x92\x63\x00\xea\x04\x48\x00\x20\x80\x28"
+ "\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x92\x63\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x92\x63\x00"
+ "\xea\x04\x48\x00\x20\x80\x28\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x00\xea\x04"
+ "\x48\x00\x20\x80\x28\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x00\xea\x04\x48\x00\x20\x80\x28\x00\x00"
+ "\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x92\x63\x00\xea\x04\x48\x00\x20\x80\x28\x00\x00\x11\x00\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x00\xea\x04\x48\x00\x20\x80\x28\x00\x00\x11\x00\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x92\x63\x00\xea\x04\x48\x00\x20\x80\x28"
+ "\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x92\x63\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x92\x63\x00\xea\x04\x48\x00\x20\x80\x28\x00\x00"
+ "\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x00\xea\x04\x48\x00\x20\x80\x28\x00\x00\x11\x00"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x92"
+ "\x63\x00\xea\x04\x48\x00\x20\x80\x28\x00\x00\x11\x00\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x00"
+ "\xea\x04\x48\x00\x20\x80\x28\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x63\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x92\x63\x00\xea\x04\x48\x00"
+ "\x20\x80\x28\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x00\xea\x04\x48\x00\x20\x80"
+ "\x28\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x92\x63\x00\xea\x04\x48\x00\x20\x80\x28\x00\x00"
+ "\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x00\xea\x04\x48\x00\x20\x80\x28\x00\x00\x11\x00"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x92"
+ "\x63\x00\xea\x04\x48\x00\x20\x80\x28\x00\x00\x11\x00\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x92\x63\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x92\x63\x00"
+ "\xea\x04\x48\x00\x20\x80\x28\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x00\xea\x04"
+ "\x48\x00\x20\x80\x28\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x92\x63\x00\xea\x04\x48\x00\x20"
+ "\x80\x28\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x00\xea\x04\x48\x00\x20\x80\x28"
+ "\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1a\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x92\x63\x00"
+ "\xea\x04\x48\x00\x20\x80\x28\x00\x00\x11\x00\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b"
+ "\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x1b\x92\x63\x00\x04\xea\x48\x00\x20"
+ "\x80\x28\x00\x00\x11\x1b\x1b\x1b\x1b\x92\x63\x00\xea\x04\x48\x00"
+ "\x20\x80\x28\x00\x00\x11\x00\x00\x01\x04\x00\x3f\x00\x00\x00\x00"
+ "\x28\xf7\xff\x00\xff\xff\xff\xff\x00\x00";
+
+int
+tmain(int argc, tchar *argv[])
+{
+ u8 in[4096];
+ u8 out[10000];
+ u64 t, tz;
+
+ begin_program(argv);
+
+ begin_performance_test();
+
+ /* static huffman case */
+ generate_empty_static_huffman_blocks(in, sizeof(in));
+ t = do_test_libdeflate("static huffman", in, sizeof(in),
+ out, sizeof(out));
+ tz = do_test_zlib("static huffman", in, sizeof(in), out, sizeof(out));
+ /*
+ * libdeflate is faster than zlib in this case, e.g.
+ * [static huffman, libdeflate]: 215861 KB/s
+ * [static huffman, zlib ]: 73651 KB/s
+ */
+ putchar('\n');
+ ASSERT(t < tz);
+
+ /* dynamic huffman case */
+ generate_empty_dynamic_huffman_blocks(in, sizeof(in));
+ t = do_test_libdeflate("dynamic huffman", in, sizeof(in),
+ out, sizeof(out));
+ tz = do_test_zlib("dynamic huffman", in, sizeof(in), out, sizeof(out));
+ /*
+ * libdeflate is slower than zlib in this case, though not super bad.
+ * [dynamic huffman, libdeflate]: 6277 KB/s
+ * [dynamic huffman, zlib ]: 10419 KB/s
+ * FIXME: make it faster.
+ */
+ putchar('\n');
+ ASSERT(t < 4 * tz);
+
+ /* original reproducer */
+ t = do_test_libdeflate("original repro", orig_repro, sizeof(orig_repro),
+ out, sizeof(out));
+ tz = do_test_zlib("original repro", orig_repro, sizeof(orig_repro),
+ out, sizeof(out));
+ ASSERT(t < tz);
+
+ return 0;
+}
diff --git a/tools/z64compress/src/enc/libdeflate/programs/test_trailing_bytes.c b/tools/z64compress/src/enc/libdeflate/programs/test_trailing_bytes.c
new file mode 100644
index 000000000..e37e97b9c
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/programs/test_trailing_bytes.c
@@ -0,0 +1,151 @@
+/*
+ * test_trailing_bytes.c
+ *
+ * Test that decompression correctly stops at the end of the first DEFLATE,
+ * zlib, or gzip stream, and doesn't process any additional trailing bytes.
+ */
+
+#include "test_util.h"
+
+static const struct {
+ size_t (*compress)(struct libdeflate_compressor *compressor,
+ const void *in, size_t in_nbytes,
+ void *out, size_t out_nbytes_avail);
+ enum libdeflate_result (*decompress)(
+ struct libdeflate_decompressor *decompressor,
+ const void *in, size_t in_nbytes,
+ void *out, size_t out_nbytes_avail,
+ size_t *actual_out_nbytes_ret);
+ enum libdeflate_result (*decompress_ex)(
+ struct libdeflate_decompressor *decompressor,
+ const void *in, size_t in_nbytes,
+ void *out, size_t out_nbytes_avail,
+ size_t *actual_in_nbytes_ret,
+ size_t *actual_out_nbytes_ret);
+} codecs[] = {
+ {
+ .compress = libdeflate_deflate_compress,
+ .decompress = libdeflate_deflate_decompress,
+ .decompress_ex = libdeflate_deflate_decompress_ex,
+ }, {
+ .compress = libdeflate_zlib_compress,
+ .decompress = libdeflate_zlib_decompress,
+ .decompress_ex = libdeflate_zlib_decompress_ex,
+ }, {
+ .compress = libdeflate_gzip_compress,
+ .decompress = libdeflate_gzip_decompress,
+ .decompress_ex = libdeflate_gzip_decompress_ex,
+ }
+};
+
+int
+tmain(int argc, tchar *argv[])
+{
+ const size_t original_nbytes = 32768;
+ const size_t compressed_nbytes_total = 32768;
+ /*
+ * Don't use the full buffer for compressed data, because we want to
+ * test whether decompression can deal with additional trailing bytes.
+ *
+ * Note: we can't use a guarded buffer (i.e. a buffer where the byte
+ * after compressed_nbytes is unmapped) because the decompressor may
+ * read a few bytes beyond the end of the stream (but ultimately not
+ * actually use those bytes) as long as they are within the buffer.
+ */
+ const size_t compressed_nbytes_avail = 30000;
+ size_t i;
+ u8 *original;
+ u8 *compressed;
+ u8 *decompressed;
+ struct libdeflate_compressor *c;
+ struct libdeflate_decompressor *d;
+ size_t compressed_nbytes;
+ enum libdeflate_result res;
+ size_t actual_compressed_nbytes;
+ size_t actual_decompressed_nbytes;
+
+ begin_program(argv);
+
+ ASSERT(compressed_nbytes_avail < compressed_nbytes_total);
+
+ /* Prepare some dummy data to compress */
+ original = xmalloc(original_nbytes);
+ ASSERT(original != NULL);
+ for (i = 0; i < original_nbytes; i++)
+ original[i] = (i % 123) + (i % 1023);
+
+ compressed = xmalloc(compressed_nbytes_total);
+ ASSERT(compressed != NULL);
+ memset(compressed, 0, compressed_nbytes_total);
+
+ decompressed = xmalloc(original_nbytes);
+ ASSERT(decompressed != NULL);
+
+ c = libdeflate_alloc_compressor(6);
+ ASSERT(c != NULL);
+
+ d = libdeflate_alloc_decompressor();
+ ASSERT(d != NULL);
+
+ for (i = 0; i < ARRAY_LEN(codecs); i++) {
+ compressed_nbytes = codecs[i].compress(c, original,
+ original_nbytes,
+ compressed,
+ compressed_nbytes_avail);
+ ASSERT(compressed_nbytes > 0);
+ ASSERT(compressed_nbytes <= compressed_nbytes_avail);
+
+ /* Test decompress() of stream that fills the whole buffer */
+ actual_decompressed_nbytes = 0;
+ memset(decompressed, 0, original_nbytes);
+ res = codecs[i].decompress(d, compressed, compressed_nbytes,
+ decompressed, original_nbytes,
+ &actual_decompressed_nbytes);
+ ASSERT(res == LIBDEFLATE_SUCCESS);
+ ASSERT(actual_decompressed_nbytes == original_nbytes);
+ ASSERT(memcmp(decompressed, original, original_nbytes) == 0);
+
+ /* Test decompress_ex() of stream that fills the whole buffer */
+ actual_compressed_nbytes = actual_decompressed_nbytes = 0;
+ memset(decompressed, 0, original_nbytes);
+ res = codecs[i].decompress_ex(d, compressed, compressed_nbytes,
+ decompressed, original_nbytes,
+ &actual_compressed_nbytes,
+ &actual_decompressed_nbytes);
+ ASSERT(res == LIBDEFLATE_SUCCESS);
+ ASSERT(actual_compressed_nbytes == compressed_nbytes);
+ ASSERT(actual_decompressed_nbytes == original_nbytes);
+ ASSERT(memcmp(decompressed, original, original_nbytes) == 0);
+
+ /* Test decompress() of stream with trailing bytes */
+ actual_decompressed_nbytes = 0;
+ memset(decompressed, 0, original_nbytes);
+ res = codecs[i].decompress(d, compressed,
+ compressed_nbytes_total,
+ decompressed, original_nbytes,
+ &actual_decompressed_nbytes);
+ ASSERT(res == LIBDEFLATE_SUCCESS);
+ ASSERT(actual_decompressed_nbytes == original_nbytes);
+ ASSERT(memcmp(decompressed, original, original_nbytes) == 0);
+
+ /* Test decompress_ex() of stream with trailing bytes */
+ actual_compressed_nbytes = actual_decompressed_nbytes = 0;
+ memset(decompressed, 0, original_nbytes);
+ res = codecs[i].decompress_ex(d, compressed,
+ compressed_nbytes_total,
+ decompressed, original_nbytes,
+ &actual_compressed_nbytes,
+ &actual_decompressed_nbytes);
+ ASSERT(res == LIBDEFLATE_SUCCESS);
+ ASSERT(actual_compressed_nbytes == compressed_nbytes);
+ ASSERT(actual_decompressed_nbytes == original_nbytes);
+ ASSERT(memcmp(decompressed, original, original_nbytes) == 0);
+ }
+
+ free(original);
+ free(compressed);
+ free(decompressed);
+ libdeflate_free_compressor(c);
+ libdeflate_free_decompressor(d);
+ return 0;
+}
diff --git a/tools/z64compress/src/enc/libdeflate/programs/test_util.c b/tools/z64compress/src/enc/libdeflate/programs/test_util.c
new file mode 100644
index 000000000..20e7c217f
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/programs/test_util.c
@@ -0,0 +1,243 @@
+/*
+ * test_util.c - utility functions for test programs
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _WIN32
+/* for MAP_ANONYMOUS or MAP_ANON, which unfortunately aren't part of POSIX... */
+# undef _POSIX_C_SOURCE
+# ifdef __APPLE__
+# define _DARWIN_C_SOURCE
+# elif defined(__linux__)
+# define _GNU_SOURCE
+# endif
+#endif
+
+#include "test_util.h"
+
+#include <fcntl.h>
+#include <stdarg.h>
+#ifdef _WIN32
+# include <windows.h>
+#else
+# include <sys/mman.h>
+# include <sys/time.h>
+# include <unistd.h>
+#endif
+
+#ifndef MAP_ANONYMOUS
+# define MAP_ANONYMOUS MAP_ANON
+#endif
+
+/*
+ * Abort with an error message.  Prints "Assertion failed: <expr> at
+ * <file>:<line>" via msg(), then calls abort().  This is the target of the
+ * ASSERT() macro in test_util.h and never returns.
+ */
+_noreturn void
+assertion_failed(const char *expr, const char *file, int line)
+{
+ msg("Assertion failed: %s at %s:%d", expr, file, line);
+ abort();
+}
+
+/*
+ * Called at the start of each performance test.  Exits the program
+ * successfully (i.e. the test is skipped, not failed) unless the
+ * INCLUDE_PERF_TESTS environment variable is set.
+ */
+void
+begin_performance_test(void)
+{
+ /* Skip performance tests by default, since they can be flaky. */
+ if (getenv("INCLUDE_PERF_TESTS") == NULL)
+ exit(0);
+}
+
+/* Return the size in bytes of a virtual-memory page on this system. */
+static size_t
+get_page_size(void)
+{
+#ifdef _WIN32
+ SYSTEM_INFO info;
+
+ GetSystemInfo(&info);
+ return info.dwPageSize;
+#else
+ /* NOTE(review): sysconf() can return -1 on error; assumed not to happen
+  * for _SC_PAGESIZE on supported platforms -- confirm. */
+ return sysconf(_SC_PAGESIZE);
+#endif
+}
+
+/*
+ * Allocate a buffer with guard pages: a region of at least 'size' bytes
+ * (rounded up to whole pages) with an inaccessible page immediately before
+ * and after it, so stray accesses just outside the buffer fault.
+ *
+ * On success, *start_ret receives the first usable byte and *end_ret points
+ * one past the last usable byte.  Note that when 'size' is not a multiple of
+ * the page size, small overruns within the final partial page are NOT
+ * detected.  Failure aborts the test via ASSERT().
+ */
+void
+alloc_guarded_buffer(size_t size, u8 **start_ret, u8 **end_ret)
+{
+ const size_t pagesize = get_page_size();
+ /* Round the requested size up to a whole number of pages. */
+ const size_t nr_pages = (size + pagesize - 1) / pagesize;
+ u8 *base_addr;
+ u8 *start, *end;
+#ifdef _WIN32
+ DWORD oldProtect;
+#endif
+
+ *start_ret = NULL;
+ *end_ret = NULL;
+
+#ifdef _WIN32
+ /* Allocate buffer and guard pages with no access. */
+ base_addr = VirtualAlloc(NULL, (nr_pages + 2) * pagesize,
+ MEM_COMMIT | MEM_RESERVE, PAGE_NOACCESS);
+ if (!base_addr) {
+ msg("Unable to allocate memory (VirtualAlloc): Windows error %u",
+ (unsigned int)GetLastError());
+ ASSERT(0);
+ }
+ start = base_addr + pagesize;
+ end = start + (nr_pages * pagesize);
+
+ /* Grant read+write access to just the buffer. */
+ if (!VirtualProtect(start, end - start, PAGE_READWRITE, &oldProtect)) {
+ msg("Unable to protect memory (VirtualProtect): Windows error %u",
+ (unsigned int)GetLastError());
+ VirtualFree(base_addr, 0, MEM_RELEASE);
+ ASSERT(0);
+ }
+#else
+ /* Allocate buffer and guard pages. */
+ base_addr = mmap(NULL, (nr_pages + 2) * pagesize, PROT_READ|PROT_WRITE,
+ MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+ if (base_addr == (u8 *)MAP_FAILED) {
+ msg_errno("Unable to allocate memory (anonymous mmap)");
+ ASSERT(0);
+ }
+ start = base_addr + pagesize;
+ end = start + (nr_pages * pagesize);
+
+ /* Unmap the guard pages.  This leaves unmapped holes around the buffer;
+  * accesses there fault unless something else later gets mapped into the
+  * holes (presumably unlikely in a short-lived test -- NOTE(review)). */
+ munmap(base_addr, pagesize);
+ munmap(end, pagesize);
+#endif
+ *start_ret = start;
+ *end_ret = end;
+}
+
+/* Free a buffer that was allocated by alloc_guarded_buffer().
+ * Accepts start == NULL as a no-op, mirroring free(NULL). */
+void
+free_guarded_buffer(u8 *start, u8 *end)
+{
+ if (!start)
+ return;
+#ifdef _WIN32
+ /* On Windows the guard pages belong to the same VirtualAlloc
+  * reservation, so releasing from the base address (one page before
+  * 'start') frees the whole region including both guards. */
+ VirtualFree(start - get_page_size(), 0, MEM_RELEASE);
+#else
+ /* On POSIX the guard pages were already munmap()ed at allocation time,
+  * so only the usable region [start, end) remains mapped. */
+ munmap(start, end - start);
+#endif
+}
+
+/*
+ * Return the number of timer ticks that have elapsed since some unspecified
+ * point fixed at the start of program execution.
+ *
+ * The tick unit is platform-dependent: QueryPerformanceCounter units on
+ * Windows, nanoseconds with clock_gettime(), or microseconds with the
+ * gettimeofday() fallback.  Use the timer_*() conversion helpers below
+ * rather than interpreting raw ticks.
+ */
+u64
+timer_ticks(void)
+{
+#ifdef _WIN32
+ LARGE_INTEGER count;
+
+ QueryPerformanceCounter(&count);
+ return count.QuadPart;
+#elif defined(HAVE_CLOCK_GETTIME)
+ struct timespec ts;
+
+ clock_gettime(CLOCK_MONOTONIC, &ts);
+ return (1000000000 * (u64)ts.tv_sec) + ts.tv_nsec;
+#else
+ /* NOTE(review): gettimeofday() is wall-clock time, not monotonic, so
+  * this fallback can jump if the system clock is adjusted. */
+ struct timeval tv;
+
+ gettimeofday(&tv, NULL);
+ return (1000000 * (u64)tv.tv_sec) + tv.tv_usec;
+#endif
+}
+
+/*
+ * Return the number of timer ticks per second.  Must match the backend
+ * selected in timer_ticks() above (same #if order and conditions).
+ */
+static u64
+timer_frequency(void)
+{
+#ifdef _WIN32
+ LARGE_INTEGER freq;
+
+ QueryPerformanceFrequency(&freq);
+ return freq.QuadPart;
+#elif defined(HAVE_CLOCK_GETTIME)
+ return 1000000000;
+#else
+ return 1000000;
+#endif
+}
+
+/*
+ * Convert a number of elapsed timer ticks to milliseconds.
+ * Multiplies before dividing to preserve precision; assumes
+ * ticks * 1000 does not overflow u64 (true for any realistic run time).
+ */
+u64 timer_ticks_to_ms(u64 ticks)
+{
+ return ticks * 1000 / timer_frequency();
+}
+
+/*
+ * Convert a byte count and a number of elapsed timer ticks to MB/s.
+ * The caller must pass ticks > 0 (division by zero otherwise); the
+ * intermediate bytes * frequency product is assumed to fit in u64.
+ */
+u64 timer_MB_per_s(u64 bytes, u64 ticks)
+{
+ return bytes * timer_frequency() / ticks / 1000000;
+}
+
+/*
+ * Convert a byte count and a number of elapsed timer ticks to KB/s.
+ * Same preconditions as timer_MB_per_s(): ticks > 0 and no u64 overflow
+ * in bytes * frequency.
+ */
+u64 timer_KB_per_s(u64 bytes, u64 ticks)
+{
+ return bytes * timer_frequency() / ticks / 1000;
+}
+
+/*
+ * Append the 'num_bits' low-order bits of 'bits' to the output bitstream,
+ * LSB-first, writing out each byte as it becomes complete.  The caller must
+ * ensure os->bitcount + num_bits does not exceed the bit width of
+ * machine_word_t.  Returns false if the output buffer filled up.
+ */
+bool
+put_bits(struct output_bitstream *os, machine_word_t bits, int num_bits)
+{
+ os->bitbuf |= bits << os->bitcount;
+ os->bitcount += num_bits;
+ while (os->bitcount >= 8) {
+ if (os->next == os->end)
+ return false;
+ *os->next++ = os->bitbuf;
+ os->bitcount -= 8;
+ os->bitbuf >>= 8;
+ }
+ return true;
+}
+
+/*
+ * Write out any bits still buffered in the bitstream, zero-padding the final
+ * partial byte, and reset the bit count.  Returns false if the output buffer
+ * filled up.
+ */
+bool
+flush_bits(struct output_bitstream *os)
+{
+ while (os->bitcount > 0) {
+ if (os->next == os->end)
+ return false;
+ *os->next++ = os->bitbuf;
+ /* May go negative on the last partial byte; clamped to 0 below. */
+ os->bitcount -= 8;
+ os->bitbuf >>= 8;
+ }
+ os->bitcount = 0;
+ return true;
+}
diff --git a/tools/z64compress/src/enc/libdeflate/programs/test_util.h b/tools/z64compress/src/enc/libdeflate/programs/test_util.h
new file mode 100644
index 000000000..4fb9688f6
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/programs/test_util.h
@@ -0,0 +1,67 @@
+/*
+ * test_util.h - utility functions for test programs
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef PROGRAMS_TEST_UTIL_H
+#define PROGRAMS_TEST_UTIL_H
+
+#include "prog_util.h"
+
+#include <zlib.h> /* for comparison purposes */
+
+/* NOTE(review): if neither __GNUC__ nor __has_attribute is defined, the bare
+ * __has_attribute token in this #if may fail to preprocess on some compilers;
+ * presumably prog_util.h supplies a fallback definition -- confirm. */
+#if defined(__GNUC__) || __has_attribute(noreturn)
+# define _noreturn __attribute__((noreturn))
+#else
+# define _noreturn
+#endif
+
+/* Report a failed assertion and abort; the target of ASSERT() below. */
+void _noreturn
+assertion_failed(const char *expr, const char *file, int line);
+
+/* Abort the test program with a diagnostic if 'expr' is false.
+ * NOTE(review): brace-wrapped rather than do { } while (0), so usage in an
+ * unbraced if/else could mis-parse -- callers in this codebase appear to use
+ * it as a standalone statement. */
+#define ASSERT(expr) { if (unlikely(!(expr))) \
+ assertion_failed(#expr, __FILE__, __LINE__); }
+
+/* Exit early (skipping the test) unless perf tests are enabled via env. */
+void begin_performance_test(void);
+
+/* Guard-page-protected buffers for catching out-of-bounds accesses. */
+void alloc_guarded_buffer(size_t size, u8 **start_ret, u8 **end_ret);
+void free_guarded_buffer(u8 *start, u8 *end);
+
+/* Monotonic timing helpers; raw tick units are platform-dependent. */
+u64 timer_ticks(void);
+u64 timer_ticks_to_ms(u64 ticks);
+u64 timer_MB_per_s(u64 bytes, u64 ticks);
+u64 timer_KB_per_s(u64 bytes, u64 ticks);
+
+/* A simple LSB-first bitstream writer used to build test DEFLATE streams. */
+struct output_bitstream {
+ machine_word_t bitbuf; /* buffered bits; lowest bit is oldest */
+ int bitcount; /* number of valid bits in bitbuf */
+ u8 *next; /* next output byte to write */
+ u8 *end; /* one past the end of the output buffer */
+};
+
+bool put_bits(struct output_bitstream *os, machine_word_t bits, int num_bits);
+bool flush_bits(struct output_bitstream *os);
+
+#endif /* PROGRAMS_TEST_UTIL_H */
diff --git a/tools/z64compress/src/enc/libdeflate/programs/tgetopt.c b/tools/z64compress/src/enc/libdeflate/programs/tgetopt.c
new file mode 100644
index 000000000..868600d97
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/programs/tgetopt.c
@@ -0,0 +1,118 @@
+/*
+ * tgetopt.c - portable replacement for GNU getopt()
+ *
+ * Copyright 2016 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "prog_util.h"
+
+/* Equivalents of getopt()'s optarg, optind, opterr, and optopt. */
+tchar *toptarg;
+int toptind = 1, topterr = 1, toptopt;
+
+/*
+ * This is a simple implementation of getopt(). It can be compiled with either
+ * 'char' or 'wchar_t' as the character type.
+ *
+ * Do *not* use this implementation if you need any of the following features,
+ * as they are not supported:
+ * - Long options
+ * - Option-related arguments retained in argv, not nulled out
+ * - '+' and '-' characters in optstring
+ *
+ * Returns: the option character; '?' for an unrecognized option (or a missing
+ * argument when optstring does not begin with ':'); ':' for a missing
+ * argument when optstring begins with ':'; or -1 when no options remain, at
+ * which point nonoptions have been permuted to the end of argv and toptind
+ * indexes the first of them.  Consumed argv entries are set to NULL.
+ */
+int
+tgetopt(int argc, tchar *argv[], const tchar *optstring)
+{
+ static tchar empty[1];
+ /* Static scan state; reset whenever the caller rewinds toptind to 1. */
+ static tchar *nextchar;
+ static bool done;
+
+ if (toptind == 1) {
+ /* Starting to scan a new argument vector */
+ nextchar = NULL;
+ done = false;
+ }
+
+ while (!done && (nextchar != NULL || toptind < argc)) {
+ if (nextchar == NULL) {
+ /* Scanning a new argument */
+ tchar *arg = argv[toptind++];
+ if (arg[0] == '-' && arg[1] != '\0') {
+ if (arg[1] == '-' && arg[2] == '\0') {
+ /* All args after "--" are nonoptions */
+ argv[toptind - 1] = NULL;
+ done = true;
+ } else {
+ /* Start of short option characters */
+ nextchar = &arg[1];
+ }
+ }
+ } else {
+ /* More short options in previous arg */
+ tchar opt = *nextchar;
+ tchar *p = tstrchr(optstring, opt);
+ if (p == NULL) {
+ if (topterr)
+ msg("invalid option -- '%"TC"'", opt);
+ toptopt = opt;
+ return '?';
+ }
+ /* 'opt' is a valid short option character */
+ nextchar++;
+ toptarg = NULL;
+ if (*(p + 1) == ':') {
+ /* 'opt' can take an argument */
+ if (*nextchar != '\0') {
+ /* Optarg is in same argv argument */
+ toptarg = nextchar;
+ /* Point at "" so the end-of-arg check below fires. */
+ nextchar = empty;
+ } else if (toptind < argc && *(p + 2) != ':') {
+ /* Optarg is next argv argument */
+ argv[toptind - 1] = NULL;
+ toptarg = argv[toptind++];
+ } else if (*(p + 2) != ':') {
+ /* Required argument is missing ("x:" but no
+  * more input); "x::" (optional arg) falls
+  * through with toptarg == NULL instead. */
+ if (topterr && *optstring != ':') {
+ msg("option requires an "
+ "argument -- '%"TC"'", opt);
+ }
+ toptopt = opt;
+ opt = (*optstring == ':') ? ':' : '?';
+ }
+ }
+ if (*nextchar == '\0') {
+ /* Finished this argv entry; null it out so the
+  * final permutation pass skips it. */
+ argv[toptind - 1] = NULL;
+ nextchar = NULL;
+ }
+ return opt;
+ }
+ }
+
+ /* Done scanning. Move all nonoptions to the end, set optind to the
+ * index of the first nonoption, and return -1. */
+ toptind = argc;
+ while (--argc > 0)
+ if (argv[argc] != NULL)
+ argv[--toptind] = argv[argc];
+ done = true;
+ return -1;
+}
diff --git a/tools/z64compress/src/enc/libdeflate/scripts/afl-fuzz/deflate_compress/fuzz.c b/tools/z64compress/src/enc/libdeflate/scripts/afl-fuzz/deflate_compress/fuzz.c
new file mode 100644
index 000000000..420a7db67
--- /dev/null
+++ b/tools/z64compress/src/enc/libdeflate/scripts/afl-fuzz/deflate_compress/fuzz.c
@@ -0,0 +1,56 @@
+#include
+#include