Office 2 Hash Cat
#!/usr/bin/env python
# ----------
# PIL License:
#
# olefile is based on source code from the OleFileIO module of the Python
# Imaging Library (PIL) published by Fredrik Lundh under the following license:
#-----------------------------------------------------------------------------
# CHANGELOG: (only olefile/OleFileIO_PL changes compared to PIL 1.1.6)
# 2005-05-11 v0.10 PL: - a few fixes for Python 2.4 compatibility
# (all changes flagged with [PL])
# 2006-02-22 v0.11 PL: - a few fixes for some Office 2003 documents which raise
# exceptions in _OleStream.__init__()
# 2006-06-09 v0.12 PL: - fixes for files above 6.8MB (DIFAT in loadfat)
# - added some constants
# - added header values checks
# - added some docstrings
# - getsect: bugfix in case sectors >512 bytes
# - getsect: added conformity checks
# - DEBUG_MODE constant to activate debug display
# 2007-09-04 v0.13 PL: - improved/translated (lots of) comments
# - updated license
# - converted tabs to 4 spaces
# 2007-11-19 v0.14 PL: - added OleFileIO._raise_defect() to adapt sensitivity
# - improved _unicode() to use Python 2.x unicode support
# - fixed bug in _OleDirectoryEntry
# 2007-11-25 v0.15 PL: - added safety checks to detect FAT loops
# - fixed _OleStream which didn't check stream size
# - added/improved many docstrings and comments
# - moved helper functions _unicode and _clsid out of
# OleFileIO class
# - improved OleFileIO._find() to add Unix path syntax
# - OleFileIO._find() is now case-insensitive
# - added get_type() and get_rootentry_name()
# - rewritten loaddirectory and _OleDirectoryEntry
# 2007-11-27 v0.16 PL: - added _OleDirectoryEntry.kids_dict
# - added detection of duplicate filenames in storages
# - added detection of duplicate references to streams
# - added get_size() and exists() to _OleDirectoryEntry
# - added isOleFile to check header before parsing
# - added __all__ list to control public keywords in pydoc
# 2007-12-04 v0.17 PL: - added _load_direntry to fix a bug in loaddirectory
# - improved _unicode(), added workarounds for Python <2.3
# - added set_debug_mode and -d option to set debug mode
# - fixed bugs in OleFileIO.open and _OleDirectoryEntry
# - added safety check in main for large or binary
# properties
# - allow size>0 for storages for some implementations
# 2007-12-05 v0.18 PL: - fixed several bugs in handling of FAT, MiniFAT and
# streams
# - added option '-c' in main to check all streams
# 2009-12-10 v0.19 PL: - bugfix for 32 bit arrays on 64 bits platforms
# (thanks to Ben G. and Martijn for reporting the bug)
# 2009-12-11 v0.20 PL: - bugfix in OleFileIO.open when filename is not plain str
# 2010-01-22 v0.21 PL: - added support for big-endian CPUs such as PowerPC Macs
# 2012-02-16 v0.22 PL: - fixed bug in getproperties, patch by chuckleberryfinn
# (https://bitbucket.org/decalage/olefileio_pl/issue/7)
# - added close method to OleFileIO (fixed issue #2)
# 2012-07-25 v0.23 PL: - added support for file-like objects (patch by mete0r_kr)
# 2013-05-05 v0.24 PL: - getproperties: added conversion from filetime to python
# datetime
# - main: displays properties with date format
# - new class OleMetadata to parse standard properties
# - added get_metadata method
# 2013-05-07 v0.24 PL: - a few improvements in OleMetadata
# 2013-05-24 v0.25 PL: - getproperties: option to not convert some timestamps
# - OleMetaData: total_edit_time is now a number of seconds,
# not a timestamp
# - getproperties: added support for VT_BOOL, VT_INT, V_UINT
# - getproperties: filter out null chars from strings
# - getproperties: raise non-fatal defects instead of
# exceptions when properties cannot be parsed properly
# 2013-05-27 PL: - getproperties: improved exception handling
# - _raise_defect: added option to set exception type
# - all non-fatal issues are now recorded, and displayed
# when run as a script
# 2013-07-11 v0.26 PL: - added methods to get modification and creation times
# of a directory entry or a storage/stream
# - fixed parsing of direntry timestamps
# 2013-07-24 PL: - new options in listdir to list storages and/or streams
# 2014-02-04 v0.30 PL: - upgraded code to support Python 3.x by Martin Panter
# - several fixes for Python 2.6 (xrange, MAGIC)
# - reused i32 from Pillow's _binary
# 2014-07-18 v0.31 - preliminary support for 4K sectors
# 2014-07-27 v0.31 PL: - a few improvements in OleFileIO.open (header parsing)
# - Fixed loadfat for large files with 4K sectors (issue #3)
# 2014-07-30 v0.32 PL: - added write_sect to write sectors to disk
# - added write_mode option to OleFileIO.__init__ and open
# 2014-07-31 PL: - fixed padding in write_sect for Python 3, added checks
# - added write_stream to write a stream to disk
# 2014-09-26 v0.40 PL: - renamed OleFileIO_PL to olefile
# 2014-11-09 NE: - added support for Jython (Niko Ehrenfeuchter)
# 2014-11-13 v0.41 PL: - improved isOleFile and OleFileIO.open to support OLE
# data in a string buffer and file-like objects.
# 2014-11-21 PL: - updated comments according to Pillow's commits
# 2015-01-24 v0.42 PL: - changed the default path name encoding from Latin-1
# to UTF-8 on Python 2.x (Unicode on Python 3.x)
# - added path_encoding option to override the default
# - fixed a bug in _list when a storage is empty
#-----------------------------------------------------------------------------
# TODO (for version 1.0):
# + get rid of print statements, to simplify Python 2.x and 3.x support
# + add is_stream and is_storage
# + remove leading and trailing slashes where a path is used
# + add functions path_list2str and path_str2list
# + fix how all the methods handle unicode str and/or bytes as arguments
# + add path attrib to _OleDirEntry, set it once and for all in init or
# append_kids (then listdir/_list can be simplified)
# - TESTS with Linux, MacOSX, Python 1.5.2, various files, PIL, ...
# - add underscore to each private method, to avoid their display in
# pydoc/epydoc documentation - Remove it for classes to be documented
# - replace all raised exceptions with _raise_defect (at least in OleFileIO)
# - merge code from _OleStream and OleFileIO.getsect to read sectors
# (maybe add a class for FAT and MiniFAT ?)
# - add method to check all streams (follow sectors chains without storing all
# stream in memory, and report anomalies)
# - use _OleDirectoryEntry.kids_dict to improve _find and _list ?
# - fix Unicode names handling (find some way to stay compatible with Py1.5.2)
# => if possible avoid converting names to Latin-1
# - review DIFAT code: fix handling of DIFSECT blocks in FAT (not stop)
# - rewrite OleFileIO.getproperties
# - improve docstrings to show more sample uses
# - see also original notes and FIXME below
# - remove all obsolete FIXMEs
# - OleMetadata: fix version attrib according to
# http://msdn.microsoft.com/en-us/library/dd945671%28v=office.12%29.aspx
# IDEAS:
# - in OleFileIO._open and _OleStream, use size=None instead of 0x7FFFFFFF for
# streams with unknown size
# - use arrays of int instead of long integers for FAT/MiniFAT, to improve
# performance and reduce memory usage ? (possible issue with values >2^31)
# - provide tests with unittest (may need write support to create samples)
# - move all debug code (and maybe dump methods) to a separate module, with
# a class which inherits OleFileIO ?
# - fix docstrings to follow epydoc format
# - add support for big endian byte order ?
# - create a simple OLE explorer with wxPython
#-----------------------------------------------------------------------------
# NOTES from PIL 1.1.6:
# History:
# 1997-01-20 fl Created
# 1997-01-22 fl Fixed 64-bit portability quirk
# 2003-09-09 fl Fixed typo in OleFileIO.loadfat (noted by Daniel Haertle)
# 2004-02-29 fl Changed long hex constants to signed integers
#
# Notes:
# FIXME: sort out sign problem (eliminate long hex constants)
# FIXME: change filename to use "a/b/c" instead of ["a", "b", "c"]
# FIXME: provide a glob mechanism function (using fnmatchcase)
#
# Literature:
#
# "FlashPix Format Specification, Appendix A", Kodak and Microsoft,
# September 1996.
#
# Quotes:
#
# "If this document and functionality of the Software conflict,
# the actual functionality of the Software represents the correct
# functionality" -- Microsoft, in the OLE format specification
#------------------------------------------------------------------------------
import io
import sys
import struct, array, os.path, datetime
#[PL] Define explicitly the public API to avoid private objects in pydoc:
#TODO: add more
# __all__ = ['OleFileIO', 'isOleFile', 'MAGIC']
#[PL] workaround to fix an issue with array item size on 64 bits systems:
# pick an array typecode whose item size is exactly 32 bits (4 bytes),
# since FAT/MiniFAT entries are 32-bit sector indexes.
if array.array('L').itemsize == 4:
    # on 32 bits platforms, long integers in an array are 32 bits:
    UINT32 = 'L'
elif array.array('I').itemsize == 4:
    # on 64 bits platforms, integers in an array are 32 bits:
    UINT32 = 'I'
elif array.array('i').itemsize == 4:
    # On 64 bit Jython, signed integers ('i') are the only way to store our
    # 32 bit values in an array in a *somewhat* reasonable way: Java doesn't
    # have unsigned values, and thus Jython's "array" implementation, which
    # is based on "jarray", doesn't have them either.
    # NOTE: to trick Jython into converting the values it would normally
    # interpret as "signed" into "unsigned", a binary-and operation with
    # 0xFFFFFFFF can be used. This way it is possible to use the same
    # comparing operations on all platforms / implementations. The
    # corresponding code lines are flagged with a 'JYTHON-WORKAROUND' tag.
    UINT32 = 'i'
else:
    raise ValueError('Need to fix a bug with 32 bit arrays, please contact author...')
# Default encoding used for stream/storage path names:
# Python 2.x works on byte strings, so paths default to UTF-8; on Python 3.x
# names are already unicode strings, so no encoding is applied (None).
DEFAULT_PATH_ENCODING = 'utf-8' if sys.version_info[0] < 3 else None
def set_debug_mode(debug_mode):
    """
    Set debug mode on or off, to control display of debugging messages.

    :param debug_mode: True to enable debugging output, False to disable it.
    """
    # rebind the module-level debug() function so callers pay no per-call cost
    global DEBUG_MODE, debug
    DEBUG_MODE = debug_mode
    if debug_mode:
        debug = debug_print
    else:
        debug = debug_pass
#[PL]: added constants for Directory Entry IDs (from AAF specifications)
MAXREGSID = 0xFFFFFFFA  # (-6) maximum directory entry ID
NOSTREAM = 0xFFFFFFFF   # (-1) unallocated directory entry

#
# --------------------------------------------------------------------
# property types

# Reverse map from VT_* constant values to their names.
# NOTE: must remain a plain for loop — vars() has to be evaluated in the
# module frame (a comprehension would see its own scope on Python 3).
VT = {}
for keyword, var in list(vars().items()):
    if keyword.startswith("VT_"):
        VT[var] = keyword

#
# --------------------------------------------------------------------
# Some common document types (root.clsid fields)
WORD_CLSID = "00020900-0000-0000-C000-000000000046"
#TODO: check Excel, PPT, ...

# Minimal size of an empty OLE file, with 512-bytes sectors = 1536 bytes
# (this is used in isOleFile and OleFile.open)
MINIMAL_OLEFILE_SIZE = 1536
if bytes is str:
    # Python 2.x: indexing a byte string yields a 1-char str, so use ord()
    def i8(c):
        """Return the integer value of a single byte (Python 2 flavour)."""
        return ord(c)
else:
    # Python 3.x: indexing bytes already yields an int; accept either an int
    # or a bytes-like object (in which case its first byte is used)
    def i8(c):
        """Return the integer value of a single byte (Python 3 flavour)."""
        return c if c.__class__ is int else c[0]
#TODO: replace i32 with a struct.unpack equivalent as well (i16 done below)
def i16(c, o = 0):
    """
    Converts a 2-bytes (16 bits) little-endian string to an unsigned integer.

    :param c: string/bytes containing the bytes to convert
    :param o: offset of the bytes to convert in the string
    """
    # struct.unpack is clearer than the previous manual shifting via i8
    # (assumes c is a bytes/str buffer, which is how it is used in this file)
    return struct.unpack('<H', c[o:o+2])[0]
def _clsid(clsid):
"""
Converts a CLSID to a human-readable string.
:param clsid: string of length 16.
"""
assert len(clsid) == 16
# if clsid is only made of null bytes, return an empty string:
# (PL: why not simply return the string with zeroes?)
if not clsid.strip(b"\0"):
return ""
return (("%08X-%04X-%04X-%02X%02X-" + "%02X" * 6) %
((i32(clsid, 0), i16(clsid, 4), i16(clsid, 6)) +
tuple(map(i8, clsid[8:16]))))
def filetime2datetime(filetime):
    """
    Convert a FILETIME (64-bit int, number of 100ns steps since 1601-01-01)
    to a Python datetime.datetime (naive, UTC).
    """
    # TODO: manage exception when microseconds is too large
    # inspired from http://code.activestate.com/recipes/511425-filetime-to-datetime/
    # (the URL above was previously split across two lines, leaving a bare
    # "datetime/" token in the code — a syntax error)
    _FILETIME_null_date = datetime.datetime(1601, 1, 1, 0, 0, 0)
    #debug('timedelta days=%d' % (filetime//(10*1000000*3600*24)))
    return _FILETIME_null_date + datetime.timedelta(microseconds=filetime//10)
class OleMetadata:
    """
    Class to parse and store metadata from standard properties of OLE files.

    Available attributes:
    codepage, title, subject, author, keywords, comments, template,
    last_saved_by, revision_number, total_edit_time, last_printed, create_time,
    last_saved_time, num_pages, num_words, num_chars, thumbnail,
    creating_application, security, codepage_doc, category, presentation_target,
    bytes, lines, paragraphs, slides, notes, hidden_slides, mm_clips,
    scale_crop, heading_pairs, titles_of_parts, manager, company, links_dirty,
    chars_with_spaces, unused, shared_doc, link_base, hlinks, hlinks_changed,
    version, dig_sig, content_type, content_status, language, doc_version

    Note: an attribute is set to None when not present in the properties of
    the OLE file.

    References for SummaryInformation stream:
    - http://msdn.microsoft.com/en-us/library/dd942545.aspx
    - http://msdn.microsoft.com/en-us/library/dd925819%28v=office.12%29.aspx
    - http://msdn.microsoft.com/en-us/library/windows/desktop/aa380376%28v=vs.85%29.aspx
    - http://msdn.microsoft.com/en-us/library/aa372045.aspx
    - http://sedna-soft.de/summary-information-stream/
    - http://poi.apache.org/apidocs/org/apache/poi/hpsf/SummaryInformation.html

    References for DocumentSummaryInformation stream:
    - http://msdn.microsoft.com/en-us/library/dd945671%28v=office.12%29.aspx
    - http://msdn.microsoft.com/en-us/library/windows/desktop/aa380374%28v=vs.85%29.aspx
    - http://poi.apache.org/apidocs/org/apache/poi/hpsf/DocumentSummaryInformation.html

    new in version 0.25
    """

    # Attribute names grouped by source stream; dump() iterates these two
    # lists. (Restored: dump() referenced them but they were missing from
    # this copy of the file, which would have raised AttributeError.)
    SUMMARY_ATTRIBS = ['codepage', 'title', 'subject', 'author', 'keywords',
                       'comments', 'template', 'last_saved_by',
                       'revision_number', 'total_edit_time', 'last_printed',
                       'create_time', 'last_saved_time', 'num_pages',
                       'num_words', 'num_chars', 'thumbnail',
                       'creating_application', 'security']
    DOCSUM_ATTRIBS = ['codepage_doc', 'category', 'presentation_target',
                      'bytes', 'lines', 'paragraphs', 'slides', 'notes',
                      'hidden_slides', 'mm_clips', 'scale_crop',
                      'heading_pairs', 'titles_of_parts', 'manager',
                      'company', 'links_dirty', 'chars_with_spaces',
                      'unused', 'shared_doc', 'link_base', 'hlinks',
                      'hlinks_changed', 'version', 'dig_sig', 'content_type',
                      'content_status', 'language', 'doc_version']

    # NOTE(review): get_metadata() calls parse_properties() on this class,
    # but that method is not present in this extracted copy — confirm
    # against the upstream olefile source.

    def __init__(self):
        """
        Constructor for OleMetadata: all attributes are set to None.
        """
        # properties from SummaryInformation stream
        for attrib in self.SUMMARY_ATTRIBS:
            setattr(self, attrib, None)
        # properties from DocumentSummaryInformation stream
        for attrib in self.DOCSUM_ATTRIBS:
            setattr(self, attrib, None)

    def dump(self):
        """
        Dump all metadata, for debugging purposes.
        """
        print('Properties from SummaryInformation stream:')
        for prop in self.SUMMARY_ATTRIBS:
            value = getattr(self, prop)
            print('- %s: %s' % (prop, repr(value)))
        print('Properties from DocumentSummaryInformation stream:')
        for prop in self.DOCSUM_ATTRIBS:
            value = getattr(self, prop)
            print('- %s: %s' % (prop, repr(value)))
class _OleStream(io.BytesIO):
"""
OLE2 Stream
Returns a read-only file object which can be used to read
the contents of a OLE stream (instance of the BytesIO class).
To open a stream, use the openstream method in the OleFile class.
This function can be used with either ordinary streams,
or ministreams, depending on the offset, sectorsize, and
fat table arguments.
Attributes:
- size: actual size of data stream, after it was opened.
"""
# NOTE(review): the code below the class docstring is an orphaned fragment of
# _OleDirectoryEntry.__init__ — the "def" line and the unpacking of sizeLow,
# sizeHigh, clsid etc. were lost when this file was extracted, so the names
# olefile/sizeHigh/sizeLow/clsid are free here. Kept byte-identical; restore
# from the upstream olefile source before use.
class _OleDirectoryEntry:
"""
OLE2 Directory Entry
"""
#[PL] parsing code moved from OleFileIO.loaddirectory
# sizeHigh is only used for 4K sectors, it should be zero for 512 bytes
# sectors, BUT apparently some implementations set it as 0xFFFFFFFF, 1
# or some other value so it cannot be raised as a defect in general:
if olefile.sectorsize == 512:
if sizeHigh != 0 and sizeHigh != 0xFFFFFFFF:
debug('sectorsize=%d, sizeLow=%d, sizeHigh=%d (%X)' %
(olefile.sectorsize, sizeLow, sizeHigh, sizeHigh))
olefile._raise_defect(DEFECT_UNSURE, 'incorrect OLE stream size')
self.size = sizeLow
else:
# NOTE(review): long() is Python 2 only — presumably this branch predates
# the Python 3 port; confirm against upstream.
self.size = sizeLow + (long(sizeHigh)<<32)
debug(' - size: %d (sizeLow=%d, sizeHigh=%d)' % (self.size, sizeLow,
sizeHigh))
self.clsid = _clsid(clsid)
# a storage should have a null size, BUT some implementations such as
# Word 8 for Mac seem to allow non-null values => Potential defect:
if self.entry_type == STGTY_STORAGE and self.size != 0:
olefile._raise_defect(DEFECT_POTENTIAL, 'OLE storage with size>0')
# check if stream is not already referenced elsewhere:
if self.entry_type in (STGTY_ROOT, STGTY_STREAM) and self.size>0:
if self.size < olefile.minisectorcutoff \
and self.entry_type==STGTY_STREAM: # only streams can be in MiniFAT
# ministream object
minifat = True
else:
minifat = False
olefile._check_duplicate_stream(self.isectStart, minifat)
def build_storage_tree(self):
    """
    Read and build the red-black tree attached to this _OleDirectoryEntry
    object, if it is a storage.

    Note: this builds the tree of all sub-entries, so it should only be
    called once, on the root object.
    """
    debug('build_storage_tree: SID=%d - %s - sid_child=%d'
          % (self.sid, repr(self.name), self.sid_child))
    if self.sid_child == NOSTREAM:
        # no child SID: this entry is not a storage, nothing to walk
        return
    # a child SID other than NOSTREAM means this entry is a storage;
    # walk the tree of children to fill the kids list:
    self.append_kids(self.sid_child)
# Reflected __lt__() and __le__() will be used for __gt__() and __ge__()

def getmtime(self):
    """
    Return the modification time of a directory entry.

    :returns: None if the modification time is null, a python datetime
        object otherwise (UTC timezone).

    new in version 0.26
    """
    if not self.modifyTime:
        return None
    return filetime2datetime(self.modifyTime)
def getctime(self):
    """
    Return the creation time of a directory entry.

    :returns: None if the creation time is null, a python datetime object
        otherwise (UTC timezone).

    new in version 0.26
    """
    # (docstring fixed: it previously said "modification time")
    if self.createTime == 0:
        return None
    return filetime2datetime(self.createTime)
class OleFileIO:
    """
    OLE container object.

    This class encapsulates the interface to an OLE 2 structured
    storage file. Use the listdir and openstream methods to
    access the contents of this file.

    Object names are given as a list of strings, one for each subentry
    level. The root entry should be omitted. For example, the following
    code extracts all image streams from a Microsoft Image Composer file::

        ole = OleFileIO("fan.mic")
        for entry in ole.listdir():
            if entry[1:2] == "Image":
                fin = ole.openstream(entry)
                fout = open(entry[0:1], "wb")
                while True:
                    s = fin.read(8192)
                    if not s:
                        break
                    fout.write(s)

    You can use the viewer application provided with the Python Imaging
    Library to view the resulting files (which happens to be standard
    TIFF files).
    """
# NOTE(review): orphaned fragment — this appears to be part of the body of
# OleFileIO.open() (OLE header validation), but the enclosing "def" and the
# struct-unpacking of the header fields were lost during extraction, and
# several string literals below are broken across lines. Kept byte-identical;
# restore from the upstream olefile source before use.
header = self.fp.read(512)
if self.Sig != MAGIC:
# OLE signature should always be present
self._raise_defect(DEFECT_FATAL, "incorrect OLE signature")
if self.clsid != bytearray(16):
# according to AAF specs, CLSID should always be zero
self._raise_defect(DEFECT_INCORRECT, "incorrect CLSID in OLE header")
debug( "MinorVersion = %d" % self.MinorVersion )
debug( "DllVersion = %d" % self.DllVersion )
if self.DllVersion not in [3, 4]:
# version 3: usual format, 512 bytes per sector
# version 4: large format, 4K per sector
self._raise_defect(DEFECT_INCORRECT, "incorrect DllVersion in OLE
header")
debug( "ByteOrder = %X" % self.ByteOrder )
if self.ByteOrder != 0xFFFE:
# For now only common little-endian documents are handled correctly
self._raise_defect(DEFECT_FATAL, "incorrect ByteOrder in OLE header")
# TODO: add big-endian support for documents created on Mac ?
# But according to [MS-CFB] ? v20140502, ByteOrder MUST be 0xFFFE.
self.SectorSize = 2**self.SectorShift
debug( "SectorSize = %d" % self.SectorSize )
if self.SectorSize not in [512, 4096]:
self._raise_defect(DEFECT_INCORRECT, "incorrect SectorSize in OLE
header")
if (self.DllVersion==3 and self.SectorSize!=512) \
or (self.DllVersion==4 and self.SectorSize!=4096):
self._raise_defect(DEFECT_INCORRECT, "SectorSize does not match
DllVersion in OLE header")
self.MiniSectorSize = 2**self.MiniSectorShift
debug( "MiniSectorSize = %d" % self.MiniSectorSize )
if self.MiniSectorSize not in [64]:
self._raise_defect(DEFECT_INCORRECT, "incorrect MiniSectorSize in OLE
header")
if self.Reserved != 0 or self.Reserved1 != 0:
self._raise_defect(DEFECT_INCORRECT, "incorrect OLE header (non-null
reserved bytes)")
debug( "csectDir = %d" % self.csectDir )
# Number of directory sectors (only allowed if DllVersion != 3)
if self.SectorSize==512 and self.csectDir!=0:
self._raise_defect(DEFECT_INCORRECT, "incorrect csectDir in OLE
header")
debug( "csectFat = %d" % self.csectFat )
# csectFat = number of FAT sectors in the file
debug( "sectDirStart = %X" % self.sectDirStart )
# sectDirStart = 1st sector containing the directory
debug( "signature = %d" % self.signature )
# Signature should be zero, BUT some implementations do not follow this
# rule => only a potential defect:
# (according to MS-CFB, may be != 0 for applications supporting file
# transactions)
if self.signature != 0:
self._raise_defect(DEFECT_POTENTIAL, "incorrect OLE header
(signature>0)")
debug( "MiniSectorCutoff = %d" % self.MiniSectorCutoff )
# MS-CFB: This integer field MUST be set to 0x00001000. This field
# specifies the maximum size of a user-defined data stream allocated
# from the mini FAT and mini stream, and that cutoff is 4096 bytes.
# Any user-defined data stream larger than or equal to this cutoff size
# must be allocated as normal sectors from the FAT.
if self.MiniSectorCutoff != 0x1000:
self._raise_defect(DEFECT_INCORRECT, "incorrect MiniSectorCutoff in OLE
header")
debug( "MiniFatStart = %X" % self.MiniFatStart )
debug( "csectMiniFat = %d" % self.csectMiniFat )
debug( "sectDifStart = %X" % self.sectDifStart )
debug( "csectDif = %d" % self.csectDif )
# file clsid
self.clsid = _clsid(header[8:24])
#TODO: remove redundant attributes, and fix the code which uses them?
self.sectorsize = self.SectorSize #1 << i16(header, 30)
self.minisectorsize = self.MiniSectorSize #1 << i16(header, 32)
self.minisectorcutoff = self.MiniSectorCutoff # i32(header, 56)
# check known streams for duplicate references (these are always in FAT,
# never in MiniFAT):
self._check_duplicate_stream(self.sectDirStart)
# check MiniFAT only if it is not empty:
if self.csectMiniFat:
self._check_duplicate_stream(self.MiniFatStart)
# check DIFAT only if it is not empty:
if self.csectDif:
self._check_duplicate_stream(self.sectDifStart)
def close(self):
    """
    Close the OLE file and release the underlying file object.
    """
    self.fp.close()
# NOTE(review): orphaned fragment — this appears to be the body of
# OleFileIO.loadfat() (FAT/DIFAT loading); the enclosing "def" was lost
# during extraction ("header" is a free name here) and several string
# literals and one comment below are broken across lines. Kept
# byte-identical; restore from the upstream olefile source before use.
sect = header[76:512]
debug( "len(sect)=%d, so %d integers" % (len(sect), len(sect)//4) )
#fat = []
# [PL] FAT is an array of 32 bits unsigned ints, it's more effective
# to use an array than a list in Python.
# It's initialized as empty first:
self.fat = array.array(UINT32)
self.loadfat_sect(sect)
#self.dumpfat(self.fat)
## for i in range(0, len(sect), 4):
## ix = i32(sect, i)
## #[PL] if ix == -2 or ix == -1: # ix == 0xFFFFFFFE or ix == 0xFFFFFFFF:
## if ix == 0xFFFFFFFE or ix == 0xFFFFFFFF:
## break
## s = self.getsect(ix)
## #fat = fat + [i32(s, i) for i in range(0, len(s), 4)]
## fat = fat + array.array(UINT32, s)
if self.csectDif != 0:
# [PL] There's a DIFAT because file is larger than 6.8MB
# some checks just in case:
if self.csectFat <= 109:
# there must be at least 109 blocks in header and the rest in
# DIFAT, so number of sectors must be >109.
self._raise_defect(DEFECT_INCORRECT, 'incorrect DIFAT, not enough
sectors')
if self.sectDifStart >= self.nb_sect:
# initial DIFAT block index must be valid
self._raise_defect(DEFECT_FATAL, 'incorrect DIFAT, first index out
of range')
debug( "DIFAT analysis..." )
# We compute the necessary number of DIFAT sectors :
# Number of pointers per DIFAT sector = (sectorsize/4)-1
# (-1 because the last pointer is the next DIFAT sector number)
nb_difat_sectors = (self.sectorsize//4)-1
# (if 512 bytes: each DIFAT sector = 127 pointers + 1 towards next
DIFAT sector)
nb_difat = (self.csectFat-109 + nb_difat_sectors-1)//nb_difat_sectors
debug( "nb_difat = %d" % nb_difat )
if self.csectDif != nb_difat:
raise IOError('incorrect DIFAT')
isect_difat = self.sectDifStart
for i in iterrange(nb_difat):
debug( "DIFAT block %d, sector %X" % (i, isect_difat) )
#TODO: check if corresponding FAT SID = DIFSECT
sector_difat = self.getsect(isect_difat)
difat = self.sect2array(sector_difat)
self.dumpsect(sector_difat)
self.loadfat_sect(difat[:nb_difat_sectors])
# last DIFAT pointer is next DIFAT sector:
isect_difat = difat[nb_difat_sectors]
debug( "next DIFAT sector: %X" % isect_difat )
# checks:
if isect_difat not in [ENDOFCHAIN, FREESECT]:
# last DIFAT pointer value must be ENDOFCHAIN or FREESECT
raise IOError('incorrect end of DIFAT')
## if len(self.fat) != self.csectFat:
## # FAT should contain csectFat blocks
## print("FAT length: %d instead of %d" % (len(self.fat),
self.csectFat))
## raise IOError('incorrect DIFAT')
# since FAT is read from fixed-size sectors, it may contain more values
# than the actual number of sectors in the file.
# Keep only the relevant sector indexes:
if len(self.fat) > self.nb_sect:
debug('len(fat)=%d, shrunk to nb_sect=%d' % (len(self.fat),
self.nb_sect))
self.fat = self.fat[:self.nb_sect]
debug('\nFAT:')
self.dumpfat(self.fat)
def loadminifat(self):
    """
    Load the MiniFAT table.
    """
    # (reconstructed: several statements below were broken across lines in
    # the extracted source)
    # MiniFAT is stored in a standard sub-stream, pointed to by a header
    # field. There are two sizes to take into account for this stream:
    # 1) Stream size calculated according to the number of sectors declared
    #    in the OLE header; this allocated stream may be more than needed to
    #    store the actual sector indexes.
    #    (self.csectMiniFat is the number of sectors of size self.SectorSize)
    stream_size = self.csectMiniFat * self.SectorSize
    # 2) Actually used size, calculated by dividing the MiniStream size
    #    (given by the root entry size) by the size of mini sectors, *4 for
    #    32-bit indexes:
    nb_minisectors = (self.root.size + self.MiniSectorSize - 1) // self.MiniSectorSize
    used_size = nb_minisectors * 4
    debug('loadminifat(): minifatsect=%d, nb FAT sectors=%d, used_size=%d, stream_size=%d, nb MiniSectors=%d' %
          (self.minifatsect, self.csectMiniFat, used_size, stream_size, nb_minisectors))
    if used_size > stream_size:
        # This is not really a problem, but may indicate a wrong
        # implementation:
        self._raise_defect(DEFECT_INCORRECT, 'OLE MiniStream is larger than MiniFAT')
    # In any case, first read stream_size:
    s = self._open(self.minifatsect, stream_size, force_FAT=True).read()
    #[PL] Old code replaced by an array:
    #self.minifat = [i32(s, i) for i in range(0, len(s), 4)]
    self.minifat = self.sect2array(s)
    # Then shrink the array to used size, to avoid indexes out of MiniStream:
    debug('MiniFAT shrunk from %d to %d sectors' % (len(self.minifat), nb_minisectors))
    self.minifat = self.minifat[:nb_minisectors]
    debug('loadminifat(): len=%d' % len(self.minifat))
    debug('\nMiniFAT:')
    self.dumpfat(self.minifat)
# NOTE(review): orphaned fragments — the try/except below looks like the body
# of OleFileIO.getsect() ("sect" is a free name; the "def" was lost during
# extraction), and the max_entries code looks like the start of
# loaddirectory(). Also note the bare "except:" which would swallow
# KeyboardInterrupt — upstream should narrow it. Kept byte-identical.
# [PL] the original code in PIL was wrong when sectors are 4KB instead of
# 512 bytes:
#self.fp.seek(512 + self.sectorsize * sect)
#[PL]: added safety checks:
#print("getsect(%X)" % sect)
try:
self.fp.seek(self.sectorsize * (sect+1))
except:
debug('getsect(): sect=%X, seek=%d, filesize=%d' %
(sect, self.sectorsize*(sect+1), self._filesize))
self._raise_defect(DEFECT_FATAL, 'OLE sector index out of range')
sector = self.fp.read(self.sectorsize)
if len(sector) != self.sectorsize:
debug('getsect(): sect=%X, read=%d, sectorsize=%d' %
(sect, len(sector), self.sectorsize))
self._raise_defect(DEFECT_FATAL, 'incomplete OLE sector')
return sector
#[PL] to detect malformed documents and avoid DoS attacks, the maximum
# number of directory entries can be calculated:
max_entries = self.directory_fp.size // 128
debug('loaddirectory: size=%d, max_entries=%d' %
(self.directory_fp.size, max_entries))
def dumpdirectory(self):
    """
    Print the whole directory tree, starting from the root entry
    (for debugging only).
    """
    self.root.dump()
# NOTE(review): orphaned fragment — this appears to be part of
# OleFileIO.getproperties(); the enclosing "def" was lost during extraction
# ("filename"/"streampath" are free names) and the per-property parsing that
# assigns "value" is missing, so the loop below would raise NameError. Also
# note "id" and "type" shadow builtins. Kept byte-identical; restore from
# the upstream olefile source before use.
fp = self.openstream(filename)
data = {}
try:
# header
s = fp.read(28)
clsid = _clsid(s[8:24])
# format id
s = fp.read(20)
fmtid = _clsid(s[:16])
fp.seek(i32(s, 16))
# get section
s = b"****" + fp.read(i32(fp.read(4))-4)
# number of properties:
num_props = i32(s, 4)
except BaseException as exc:
# catch exception while parsing property header, and only raise
# a DEFECT_INCORRECT then return an empty dict, because this is not
# a fatal error when parsing the whole file
msg = 'Error while parsing properties header in stream %s: %s' % (
repr(streampath), exc)
self._raise_defect(DEFECT_INCORRECT, msg, type(exc))
return data
for i in range(num_props):
try:
id = 0 # just in case of an exception
id = i32(s, 8+i*8)
offset = i32(s, 12+i*8)
type = i32(s, offset)
data[id] = value
except BaseException as exc:
# catch exception while parsing each property, and only raise
# a DEFECT_INCORRECT, because parsing can go on
msg = 'Error while parsing property id %d in stream %s: %s' % (
id, repr(streampath), exc)
self._raise_defect(DEFECT_INCORRECT, msg, type(exc))
return data
def get_metadata(self):
    """
    Parse the standard properties streams and return an OleMetadata object
    holding all available metadata.
    (The object is also stored in the metadata attribute of this OleFileIO.)

    new in version 0.25
    """
    meta = OleMetadata()
    self.metadata = meta
    meta.parse_properties(self)
    return meta
#
# --------------------------------------------------------------------
# This script can be used to dump the directory of any OLE2 structured
# storage file.
# NOTE(review): deliberately disabled via the "__main__disabled" guard, so
# this block never runs. It is also a fragment: "ole" is used below but the
# OleFileIO(filename) call that should create it was lost during extraction.
if __name__ == "__main__disabled":
import sys
check_streams = False
for filename in sys.argv[1:]:
## try:
# OPTIONS:
if filename == '-d':
# option to switch debug mode on:
set_debug_mode(True)
continue
if filename == '-c':
# option to switch check streams mode on:
check_streams = True
continue
if check_streams:
# Read all streams to check if there are errors:
print('\nChecking streams...')
for streamname in ole.listdir():
# print name using repr() to convert binary chars to \xNN:
print('-', repr('/'.join(streamname)),'-', end=' ')
st_type = ole.get_type(streamname)
if st_type == STGTY_STREAM:
print('size %d' % ole.get_size(streamname))
# just try to read stream in memory:
ole.openstream(streamname)
else:
print('NOT a stream : type=%d' % st_type)
print()
# this code was developed while listening to The Wedding Present "Sea Monsters"

# Python 2/3 compatibility shims for the hash-extraction code below.
PY3 = sys.version_info[0] == 3

if PY3:
    # on Python 3, the extraction code works on bytes buffers
    from io import BytesIO as StringIO
else:
    # Python 2 only: force the default string encoding to UTF-8
    reload(sys)
    sys.setdefaultencoding("utf8")
    from StringIO import StringIO

from struct import unpack
import binascii
# NOTE(review): orphaned fragment — this appears to be the body of the
# function that extracts RC4 "$oldoffice$" hashes from a legacy Office
# document; the enclosing "def" was lost during extraction ("stream" and
# "filename" are free names), the top-level "return None" is invalid, and
# several string/expression lines below are broken across lines. Kept
# byte-identical; restore from the upstream office2hashcat source before use.
while True:
pos = stream.tell()
if pos >= stream.size:
break # eof
return None
if major_version == 1 or minor_version == 1:
data = stream.read(48)
salt = data[:16]
verifier = data[16:32]
verifierHash = data[32:48]
return (salt, verifier, verifierHash)
elif major_version >= 2 and minor_version == 2:
# RC4 CryptoAPI Encryption Header
unpack("<I", stream.read(4))[0] # encryptionFlags
headerLength = unpack("<I", stream.read(4))[0]
unpack("<I", stream.read(4))[0] # skipFlags
headerLength -= 4
unpack("<I", stream.read(4))[0] # sizeExtra
headerLength -= 4
unpack("<I", stream.read(4))[0] # algId
headerLength -= 4
unpack("<I", stream.read(4))[0] # algHashId
headerLength -= 4
keySize = unpack("<I", stream.read(4))[0] # keySize
headerLength -= 4
unpack("<I", stream.read(4))[0] # providerType
headerLength -= 4
unpack("<I", stream.read(4))[0] # unused
headerLength -= 4
unpack("<I", stream.read(4))[0] # unused
headerLength -= 4
CSPName = stream.read(headerLength)
provider = CSPName.decode('utf-16').lower()
# NOTE(review): if keySize is neither 128 nor 40, "typ" stays undefined
# and the write below would raise NameError — confirm against upstream.
if keySize == 128:
typ = 4
elif keySize == 40:
typ = 3
else:
sys.stderr.write("%s : invalid keySize\n" % filename)
# Encryption verifier
saltSize = unpack("<I", stream.read(4))[0]
assert(saltSize == 16)
salt = stream.read(saltSize)
encryptedVerifier = stream.read(16)
verifierHashSize = unpack("<I", stream.read(4))[0]
assert(verifierHashSize == 20)
encryptedVerifierHash = stream.read(verifierHashSize)
sys.stdout.write("$oldoffice$%s*%s*%s*%s\n" % (typ,
binascii.hexlify(salt).decode("ascii"),
binascii.hexlify(encryptedVerifier).decode("ascii"),
binascii.hexlify(encryptedVerifierHash).decode("ascii")))
else:
sys.stderr.write("%s : Cannot find RC4 pass info, is document encrypted?\n"
% filename)
def process_new_office(filename):
    """Extract a "$office$*..." hash line from a new-format (OOXML) Office file.

    Reads the "EncryptionInfo" stream of the OLE container *filename*,
    detects the encryption scheme from the stream's version header, and
    writes a hashcat/JtR-style hash line to stdout.

    Returns 0 on success, or a negative error code when the encryption
    scheme cannot be handled (-1 external provider, -3 unknown hash
    algorithm, -4 unknown cipher algorithm).
    """
    # Function-scope import: only agile (2010/2013) documents need the
    # XML parser, and the top-of-file import block is outside this view.
    from xml.etree import ElementTree

    # Detect version of new Office used by reading the "EncryptionInfo" stream.
    ole = OleFileIO(filename)
    stream = ole.openstream("EncryptionInfo")
    major_version = unpack("<h", stream.read(2))[0]
    minor_version = unpack("<h", stream.read(2))[0]
    encryptionFlags = unpack("<I", stream.read(4))[0]
    if encryptionFlags == 16:  # fExternal
        sys.stderr.write("%s : An external cryptographic provider is not supported!\n" % filename)
        return -1

    # NOTE(review): the version test below was lost in the garbled original
    # (the code jumped straight into the agile XML loop). 4.4 = agile
    # encryption per MS-OFFCRYPTO — confirm against the spec.
    if major_version == 4 and minor_version == 4:
        # Office 2010/2013 "agile" encryption: the remainder of the stream
        # is an XML encryption descriptor.
        root = ElementTree.fromstring(stream.read())
        # Element.iter() replaces getiterator(), which was removed in
        # Python 3.9.
        for node in root.iter('{http://schemas.microsoft.com/office/2006/keyEncryptor/password}encryptedKey'):
            spinCount = node.attrib.get("spinCount")
            assert(spinCount)
            saltSize = node.attrib.get("saltSize")
            assert(saltSize)
            blockSize = node.attrib.get("blockSize")
            assert(blockSize)
            keyBits = node.attrib.get("keyBits")
            hashAlgorithm = node.attrib.get("hashAlgorithm")
            if hashAlgorithm == "SHA1":
                version = 2010
            elif hashAlgorithm == "SHA512":
                version = 2013
            else:
                sys.stderr.write("%s uses un-supported hashing algorithm %s, please file a bug! \n"
                                 % (filename, hashAlgorithm))
                return -3
            cipherAlgorithm = node.attrib.get("cipherAlgorithm")
            if not cipherAlgorithm.find("AES") > -1:
                sys.stderr.write("%s uses un-supported cipher algorithm %s, please file a bug! \n"
                                 % (filename, cipherAlgorithm))
                return -4
            saltValue = node.attrib.get("saltValue")
            assert(saltValue)
            encryptedVerifierHashInput = node.attrib.get("encryptedVerifierHashInput")
            encryptedVerifierHashValue = node.attrib.get("encryptedVerifierHashValue")
            # base64.b64decode replaces base64.decodestring, which was
            # removed in Python 3.9.
            encryptedVerifierHashValue = binascii.hexlify(
                base64.b64decode(encryptedVerifierHashValue.encode()))
            sys.stdout.write("$office$*%d*%d*%d*%d*%s*%s*%s\n" %
                (version, int(spinCount), int(keyBits), int(saltSize),
                 binascii.hexlify(base64.b64decode(saltValue.encode())).decode("ascii"),
                 binascii.hexlify(base64.b64decode(encryptedVerifierHashInput.encode())).decode("ascii"),
                 encryptedVerifierHashValue[0:64].decode("ascii")))
            return 0
    else:
        # Office 2007 file detected, process CryptoAPI Encryption Header.
        stm = stream
        headerLength = unpack("<I", stm.read(4))[0]
        unpack("<I", stm.read(4))[0]  # skipFlags
        headerLength -= 4
        unpack("<I", stm.read(4))[0]  # sizeExtra
        headerLength -= 4
        unpack("<I", stm.read(4))[0]  # algId
        headerLength -= 4
        unpack("<I", stm.read(4))[0]  # algHashId
        headerLength -= 4
        keySize = unpack("<I", stm.read(4))[0]
        headerLength -= 4
        unpack("<I", stm.read(4))[0]  # providerType
        headerLength -= 4
        unpack("<I", stm.read(4))[0]  # unused
        headerLength -= 4
        unpack("<I", stm.read(4))[0]  # unused
        headerLength -= 4
        # CSP name: must be consumed to reach the verifier, value unused.
        stm.read(headerLength)
        # Encryption verifier
        saltSize = unpack("<I", stm.read(4))[0]
        assert(saltSize == 16)
        salt = stm.read(saltSize)
        encryptedVerifier = stm.read(16)
        verifierHashSize = unpack("<I", stm.read(4))[0]
        encryptedVerifierHash = stm.read(verifierHashSize)
        sys.stdout.write("$office$*%d*%d*%d*%d*%s*%s*%s\n" %
            (2007, verifierHashSize, keySize, saltSize,
             binascii.hexlify(salt).decode("ascii"),
             binascii.hexlify(encryptedVerifier).decode("ascii"),
             binascii.hexlify(encryptedVerifierHash)[0:64].decode("ascii")))
        return 0
# Module-level state. NOTE(review): `have_summary` / `summary` are
# initialized here but never read in the visible portion of the file —
# presumably consumed by summary/metadata reporting code elsewhere;
# confirm before removing.
have_summary = False
summary = []
import re  # used by remove_html_tags / remove_extra_spaces below
def remove_html_tags(data):
    """Return *data* coerced to str with every <...> tag span removed.

    DOTALL lets a single tag span multiple lines.
    """
    return re.sub(r'<.*?>', '', str(data), flags=re.DOTALL)
def remove_extra_spaces(data):
    """Collapse every run of whitespace in *data* into a single space."""
    return re.sub(r'\s+', ' ', data)
def process_file(filename):
    """Detect the type of an encrypted Office document and dump its hash.

    Dispatches to the format-specific extractor (new-format OOXML, XLS
    Workbook, DOC table stream, or PPT) and returns 0 on success, a
    non-zero error code otherwise.
    """
    # Cheap sanity checks before handing the file to the OLE parser.
    try:
        with open(filename, "rb") as f:  # `with` replaces the manual close pair
            data = f.read(81920)  # header area only; enough for the magic check
        if data[0:2] == b"PK":
            sys.stderr.write("%s : zip container found, file is " \
                             "unencrypted?, invalid OLE file!\n" % filename)
            return 1
        if not isOleFile(filename):
            sys.stderr.write("%s : Invalid OLE file\n" % filename)
            return 1
    except Exception:
        e = sys.exc_info()[1]
        import traceback
        traceback.print_exc()
        sys.stderr.write("%s : OLE check failed, %s\n" % (filename, str(e)))
        return 2

    # Restored: the garbled original used `ole` below without ever
    # assigning it (NameError).
    ole = OleFileIO(filename)
    stream = None
    if ["EncryptionInfo"] in ole.listdir():
        # process Office 2007 / 2010 / 2013 files
        return process_new_office(filename)
    if ["Workbook"] in ole.listdir():
        stream = "Workbook"
    elif ["WordDocument"] in ole.listdir():
        typ = 1
        sdoc = ole.openstream("WordDocument")
        stream = find_table(filename, sdoc)
        if stream == "none":
            return 5
    try:
        workbookStream = ole.openstream(stream)
    except Exception:  # narrowed from a bare except
        import traceback
        traceback.print_exc()
        sys.stderr.write("%s : stream %s not found!\n" % (filename, stream))
        return 2
    if workbookStream is None:
        # Fixed: the original fed a bare `filename` to a two-placeholder
        # format string (TypeError) with the real argument tuple orphaned
        # on the next line.
        sys.stderr.write("%s : Error opening stream, %s\n" % (filename, stream))
        return 3
    if stream == "Workbook":
        typ = 0
        passinfo = find_rc4_passinfo_xls(filename, workbookStream)
        if passinfo is None:
            return 4
    elif stream == "0Table" or stream == "1Table":
        passinfo = find_rc4_passinfo_doc(filename, workbookStream)
        if passinfo is None:
            return 4
    else:
        # PowerPoint: locate the record offset via "Current User", then
        # parse the main document stream.
        sppt = ole.openstream("Current User")
        offset = find_ppt_type(filename, sppt)
        sppt = ole.openstream("PowerPoint Document")
        find_rc4_passinfo_ppt(filename, sppt, offset)
        return 6
    workbookStream.close()
    ole.close()
    return 0
if __name__ == "__main__":
if len(sys.argv) < 2:
sys.stderr.write("Usage: %s <encrypted Office file(s)>\n" % sys.argv[0])
sys.exit(1)
# set_debug_mode(1)