2012-05-21 04:12:37 -07:00
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
2010-08-18 14:30:09 -07:00
2012-02-16 06:31:49 -08:00
import sys , os , subprocess , struct , re
2010-08-18 14:30:09 -07:00
local_file_header = [
( " signature " , " uint32 " ) ,
( " min_version " , " uint16 " ) ,
( " general_flag " , " uint16 " ) ,
( " compression " , " uint16 " ) ,
( " lastmod_time " , " uint16 " ) ,
( " lastmod_date " , " uint16 " ) ,
( " crc32 " , " uint32 " ) ,
( " compressed_size " , " uint32 " ) ,
( " uncompressed_size " , " uint32 " ) ,
( " filename_size " , " uint16 " ) ,
( " extra_field_size " , " uint16 " ) ,
( " filename " , " filename_size " ) ,
( " extra_field " , " extra_field_size " ) ,
( " data " , " compressed_size " )
]
cdir_entry = [
( " signature " , " uint32 " ) ,
( " creator_version " , " uint16 " ) ,
( " min_version " , " uint16 " ) ,
( " general_flag " , " uint16 " ) ,
( " compression " , " uint16 " ) ,
( " lastmod_time " , " uint16 " ) ,
( " lastmod_date " , " uint16 " ) ,
( " crc32 " , " uint32 " ) ,
( " compressed_size " , " uint32 " ) ,
( " uncompressed_size " , " uint32 " ) ,
( " filename_size " , " uint16 " ) ,
( " extrafield_size " , " uint16 " ) ,
( " filecomment_size " , " uint16 " ) ,
( " disknum " , " uint16 " ) ,
( " internal_attr " , " uint16 " ) ,
( " external_attr " , " uint32 " ) ,
( " offset " , " uint32 " ) ,
( " filename " , " filename_size " ) ,
( " extrafield " , " extrafield_size " ) ,
( " filecomment " , " filecomment_size " ) ,
]
cdir_end = [
( " signature " , " uint32 " ) ,
( " disk_num " , " uint16 " ) ,
( " cdir_disk " , " uint16 " ) ,
( " disk_entries " , " uint16 " ) ,
( " cdir_entries " , " uint16 " ) ,
( " cdir_size " , " uint32 " ) ,
( " cdir_offset " , " uint32 " ) ,
( " comment_size " , " uint16 " ) ,
]
type_mapping = { " uint32 " : " I " , " uint16 " : " H " }
def format_struct ( format ) :
string_fields = { }
fmt = " < "
for ( name , value ) in iter ( format ) :
try :
fmt + = type_mapping [ value ] [ 0 ]
except KeyError :
string_fields [ name ] = value
return ( fmt , string_fields )
def size_of ( format ) :
return struct . calcsize ( format_struct ( format ) [ 0 ] )
class MyStruct :
def __init__ ( self , format , string_fields ) :
self . __dict__ [ " struct_members " ] = { }
self . __dict__ [ " format " ] = format
self . __dict__ [ " string_fields " ] = string_fields
def addMember ( self , name , value ) :
self . __dict__ [ " struct_members " ] [ name ] = value
def __getattr__ ( self , item ) :
try :
return self . __dict__ [ " struct_members " ] [ item ]
except :
pass
print ( " no %s " % item )
print ( self . __dict__ [ " struct_members " ] )
raise AttributeError
def __setattr__ ( self , item , value ) :
if item in self . __dict__ [ " struct_members " ] :
self . __dict__ [ " struct_members " ] [ item ] = value
else :
raise AttributeError
def pack ( self ) :
extra_data = " "
values = [ ]
string_fields = self . __dict__ [ " string_fields " ]
struct_members = self . __dict__ [ " struct_members " ]
format = self . __dict__ [ " format " ]
for ( name , _ ) in format :
if name in string_fields :
extra_data = extra_data + struct_members [ name ]
else :
values . append ( struct_members [ name ] ) ;
return struct . pack ( format_struct ( format ) [ 0 ] , * values ) + extra_data
ENDSIG = 0x06054b50
def assert_true ( cond , msg ) :
if not cond :
raise Exception ( msg )
exit ( 1 )
class BinaryBlob :
def __init__ ( self , f ) :
self . data = open ( f , " rb " ) . read ( )
self . offset = 0
self . length = len ( self . data )
def readAt ( self , pos , length ) :
self . offset = pos + length
return self . data [ pos : self . offset ]
def read_struct ( self , format , offset = None ) :
if offset == None :
offset = self . offset
( fstr , string_fields ) = format_struct ( format )
size = struct . calcsize ( fstr )
data = self . readAt ( offset , size )
ret = struct . unpack ( fstr , data )
retstruct = MyStruct ( format , string_fields )
i = 0
for ( name , _ ) in iter ( format ) :
member_desc = None
if not name in string_fields :
member_data = ret [ i ]
i = i + 1
else :
# zip has data fields which are described by other struct fields, this does
# additional reads to fill em in
member_desc = string_fields [ name ]
member_data = self . readAt ( self . offset , retstruct . __getattr__ ( member_desc ) )
retstruct . addMember ( name , member_data )
# sanity check serialization code
data = self . readAt ( offset , self . offset - offset )
out_data = retstruct . pack ( )
assert_true ( out_data == data , " Serialization fail %d != %d " % ( len ( out_data ) , len ( data ) ) )
return retstruct
2010-09-10 09:30:07 -07:00
def optimizejar ( jar , outjar , inlog = None ) :
if inlog is not None :
inlog = open ( inlog ) . read ( ) . rstrip ( )
# in the case of an empty log still move the index forward
if len ( inlog ) == 0 :
inlog = [ ]
else :
inlog = inlog . split ( " \n " )
outlog = [ ]
2010-08-18 14:30:09 -07:00
jarblob = BinaryBlob ( jar )
dirend = jarblob . read_struct ( cdir_end , jarblob . length - size_of ( cdir_end ) )
assert_true ( dirend . signature == ENDSIG , " no signature in the end " ) ;
cdir_offset = dirend . cdir_offset
2010-09-10 09:30:07 -07:00
readahead = 0
if inlog is None and cdir_offset == 4 :
readahead = struct . unpack ( " <I " , jarblob . readAt ( 0 , 4 ) ) [ 0 ]
print ( " %s : startup data ends at byte %d " % ( outjar , readahead ) ) ;
2011-05-23 17:23:13 -07:00
total_stripped = 0 ;
2010-08-18 14:30:09 -07:00
jarblob . offset = cdir_offset
central_directory = [ ]
2011-05-23 17:23:13 -07:00
for i in range ( 0 , dirend . cdir_entries ) :
2010-08-18 14:30:09 -07:00
entry = jarblob . read_struct ( cdir_entry )
2011-05-23 17:23:13 -07:00
if entry . filename [ - 1 : ] == " / " :
total_stripped + = len ( entry . pack ( ) )
else :
total_stripped + = entry . extrafield_size
2010-08-18 14:30:09 -07:00
central_directory . append ( entry )
reordered_count = 0
2010-09-10 09:30:07 -07:00
if inlog is not None :
dup_guard = set ( )
for ordered_name in inlog :
if ordered_name in dup_guard :
continue
else :
dup_guard . add ( ordered_name )
found = False
for i in range ( reordered_count , len ( central_directory ) ) :
if central_directory [ i ] . filename == ordered_name :
# swap the cdir entries
tmp = central_directory [ i ]
central_directory [ i ] = central_directory [ reordered_count ]
central_directory [ reordered_count ] = tmp
reordered_count = reordered_count + 1
found = True
break
if not found :
print ( " Can ' t find ' %s ' in %s " % ( ordered_name , jar ) )
2010-08-18 14:30:09 -07:00
outfd = open ( outjar , " wb " )
2010-09-10 09:30:07 -07:00
out_offset = 0
if inlog is not None :
# have to put central directory at offset 4 cos 0 confuses some tools.
# This also lets us specify how many entries should be preread
dirend . cdir_offset = 4
# make room for central dir + end of dir + 4 extra bytes at front
2011-05-23 17:23:13 -07:00
out_offset = dirend . cdir_offset + dirend . cdir_size + size_of ( cdir_end ) - total_stripped
2010-09-10 09:30:07 -07:00
outfd . seek ( out_offset )
2010-08-18 14:30:09 -07:00
cdir_data = " "
written_count = 0
2011-06-20 12:23:21 -07:00
crc_mapping = { }
dups_found = 0
dupe_bytes = 0
2010-09-10 09:30:07 -07:00
# store number of bytes suggested for readahead
2010-08-18 14:30:09 -07:00
for entry in central_directory :
# read in the header twice..first for comparison, second time for convenience when writing out
jarfile = jarblob . read_struct ( local_file_header , entry . offset )
assert_true ( jarfile . filename == entry . filename , " Directory/Localheader mismatch " )
2011-05-23 17:23:13 -07:00
# drop directory entries
if entry . filename [ - 1 : ] == " / " :
total_stripped + = len ( jarfile . pack ( ) )
dirend . cdir_entries - = 1
continue
# drop extra field data
else :
total_stripped + = jarfile . extra_field_size ;
entry . extrafield = jarfile . extra_field = " "
entry . extrafield_size = jarfile . extra_field_size = 0
# January 1st, 2010
entry . lastmod_date = jarfile . lastmod_date = ( ( 2010 - 1980 ) << 9 ) | ( 1 << 5 ) | 1
entry . lastmod_time = jarfile . lastmod_time = 0
2010-08-18 14:30:09 -07:00
data = jarfile . pack ( )
outfd . write ( data )
2010-09-10 09:30:07 -07:00
old_entry_offset = entry . offset
2010-08-18 14:30:09 -07:00
entry . offset = out_offset
out_offset = out_offset + len ( data )
entry_data = entry . pack ( )
cdir_data + = entry_data
expected_len = entry . filename_size + entry . extrafield_size + entry . filecomment_size
assert_true ( len ( entry_data ) != expected_len ,
" %s entry size - expected: %d got: %d " % ( entry . filename , len ( entry_data ) , expected_len ) )
written_count + = 1
2011-06-20 12:23:21 -07:00
if entry . crc32 in crc_mapping :
dups_found + = 1
dupe_bytes + = entry . compressed_size + len ( data ) + len ( entry_data )
print ( " %s \n \t is a duplicate of \n %s \n --- " % ( entry . filename , crc_mapping [ entry . crc32 ] ) )
else :
crc_mapping [ entry . crc32 ] = entry . filename ;
2010-09-10 09:30:07 -07:00
if inlog is not None :
if written_count == reordered_count :
readahead = out_offset
print ( " %s : startup data ends at byte %d " % ( outjar , readahead ) ) ;
elif written_count < reordered_count :
pass
#print("%s @ %d" % (entry.filename, out_offset))
elif readahead > = old_entry_offset + len ( data ) :
outlog . append ( entry . filename )
reordered_count + = 1
if inlog is None :
dirend . cdir_offset = out_offset
2010-08-18 14:30:09 -07:00
2011-06-20 12:23:21 -07:00
if dups_found > 0 :
print ( " WARNING: Found %d duplicate files taking %d bytes " % ( dups_found , dupe_bytes ) )
2011-05-23 17:23:13 -07:00
dirend . cdir_size = len ( cdir_data )
2011-09-06 12:57:36 -07:00
dirend . disk_entries = dirend . cdir_entries
2010-09-10 09:30:07 -07:00
dirend_data = dirend . pack ( )
2010-08-18 14:30:09 -07:00
assert_true ( size_of ( cdir_end ) == len ( dirend_data ) , " Failed to serialize directory end correctly. Serialized size; %d , expected: %d " % ( len ( dirend_data ) , size_of ( cdir_end ) ) ) ;
2010-09-10 09:30:07 -07:00
2010-08-18 14:30:09 -07:00
outfd . seek ( dirend . cdir_offset )
outfd . write ( cdir_data )
outfd . write ( dirend_data )
2010-09-10 09:30:07 -07:00
# for ordered jars the central directory is written in the begining of the file, so a second central-directory
# entry has to be written in the end of the file
if inlog is not None :
outfd . seek ( 0 )
outfd . write ( struct . pack ( " <I " , readahead ) ) ;
outfd . seek ( out_offset )
outfd . write ( dirend_data )
2011-05-23 17:23:13 -07:00
print " Stripped %d bytes " % total_stripped
2010-09-10 09:30:07 -07:00
print " %s %d / %d in %s " % ( ( " Ordered " if inlog is not None else " Deoptimized " ) ,
reordered_count , len ( central_directory ) , outjar )
2010-08-18 14:30:09 -07:00
outfd . close ( )
2010-09-10 09:30:07 -07:00
return outlog
if len ( sys . argv ) != 5 :
print " Usage: --optimize|--deoptimize %s JAR_LOG_DIR IN_JAR_DIR OUT_JAR_DIR " % sys . argv [ 0 ]
2010-08-18 14:30:09 -07:00
exit ( 1 )
2012-02-16 06:31:49 -08:00
jar_regex = re . compile ( " \\ .jar?$ " )
2010-09-10 09:30:07 -07:00
def optimize ( JAR_LOG_DIR , IN_JAR_DIR , OUT_JAR_DIR ) :
2011-05-23 17:23:13 -07:00
ls = os . listdir ( IN_JAR_DIR )
for jarfile in ls :
2012-02-16 06:31:49 -08:00
if not re . search ( jar_regex , jarfile ) :
2010-09-10 09:30:07 -07:00
continue
2011-05-23 17:23:13 -07:00
injarfile = os . path . join ( IN_JAR_DIR , jarfile )
outjarfile = os . path . join ( OUT_JAR_DIR , jarfile )
logfile = os . path . join ( JAR_LOG_DIR , jarfile + " .log " )
if not os . path . isfile ( logfile ) :
logfile = None
2010-09-10 09:30:07 -07:00
optimizejar ( injarfile , outjarfile , logfile )
def deoptimize ( JAR_LOG_DIR , IN_JAR_DIR , OUT_JAR_DIR ) :
if not os . path . exists ( JAR_LOG_DIR ) :
os . makedirs ( JAR_LOG_DIR )
ls = os . listdir ( IN_JAR_DIR )
for jarfile in ls :
2012-02-16 06:31:49 -08:00
if not re . search ( jar_regex , jarfile ) :
2010-09-10 09:30:07 -07:00
continue
injarfile = os . path . join ( IN_JAR_DIR , jarfile )
outjarfile = os . path . join ( OUT_JAR_DIR , jarfile )
logfile = os . path . join ( JAR_LOG_DIR , jarfile + " .log " )
log = optimizejar ( injarfile , outjarfile , None )
open ( logfile , " wb " ) . write ( " \n " . join ( log ) )
def main ( ) :
MODE = sys . argv [ 1 ]
JAR_LOG_DIR = sys . argv [ 2 ]
IN_JAR_DIR = sys . argv [ 3 ]
OUT_JAR_DIR = sys . argv [ 4 ]
if MODE == " --optimize " :
optimize ( JAR_LOG_DIR , IN_JAR_DIR , OUT_JAR_DIR )
elif MODE == " --deoptimize " :
deoptimize ( JAR_LOG_DIR , IN_JAR_DIR , OUT_JAR_DIR )
else :
print ( " Unknown mode %s " % MODE )
exit ( 1 )
2010-08-18 14:30:09 -07:00
2010-09-10 09:30:07 -07:00
if __name__ == ' __main__ ' :
main ( )