diff options
author | Camil Staps | 2015-10-23 16:44:24 +0200 |
---|---|---|
committer | Camil Staps | 2015-10-23 16:51:10 +0200 |
commit | 3fd81c73cfd8bad36b2a1cf7955006e35c1d9db5 (patch) | |
tree | 30f99545bbcdd74d8965219faa6bfc7db70e8bc1 /Assignment 3/packages/xlrd/biffh.py | |
parent | Assignment 2 finished (diff) |
Assignment 3: code, plots
Diffstat (limited to 'Assignment 3/packages/xlrd/biffh.py')
-rw-r--r-- | Assignment 3/packages/xlrd/biffh.py | 663 |
1 files changed, 663 insertions, 0 deletions
diff --git a/Assignment 3/packages/xlrd/biffh.py b/Assignment 3/packages/xlrd/biffh.py new file mode 100644 index 0000000..f3a6d4d --- /dev/null +++ b/Assignment 3/packages/xlrd/biffh.py @@ -0,0 +1,663 @@ +# -*- coding: cp1252 -*- + +## +# Support module for the xlrd package. +# +# <p>Portions copyright © 2005-2010 Stephen John Machin, Lingfo Pty Ltd</p> +# <p>This module is part of the xlrd package, which is released under a BSD-style licence.</p> +## + +# 2010-03-01 SJM Reading SCL record +# 2010-03-01 SJM Added more record IDs for biff_dump & biff_count +# 2008-02-10 SJM BIFF2 BLANK record +# 2008-02-08 SJM Preparation for Excel 2.0 support +# 2008-02-02 SJM Added suffixes (_B2, _B2_ONLY, etc) on record names for biff_dump & biff_count +# 2007-12-04 SJM Added support for Excel 2.x (BIFF2) files. +# 2007-09-08 SJM Avoid crash when zero-length Unicode string missing options byte. +# 2007-04-22 SJM Remove experimental "trimming" facility. + +from __future__ import print_function + +DEBUG = 0 + +from struct import unpack +import sys +from .timemachine import * + +class XLRDError(Exception): + pass + +## +# Parent of almost all other classes in the package. Defines a common "dump" method +# for debugging. + +class BaseObject(object): + + _repr_these = [] + + ## + # @param f open file object, to which the dump is written + # @param header text to write before the dump + # @param footer text to write after the dump + # @param indent number of leading spaces (for recursive calls) + + def dump(self, f=None, header=None, footer=None, indent=0): + if f is None: + f = sys.stderr + if hasattr(self, "__slots__"): + alist = [] + for attr in self.__slots__: + alist.append((attr, getattr(self, attr))) + else: + alist = self.__dict__.items() + alist = sorted(alist) + pad = " " * indent + if header is not None: print(header, file=f) + list_type = type([]) + dict_type = type({}) + for attr, value in alist: + if getattr(value, 'dump', None) and attr != 'book': + value.dump(f, + header="%s%s (%s object):" % (pad, attr, value.__class__.__name__), + indent=indent+4) + elif attr not in self._repr_these and ( + isinstance(value, list_type) or isinstance(value, dict_type) + ): + print("%s%s: %s, len = %d" % (pad, attr, type(value), len(value)), file=f) + else: + fprintf(f, "%s%s: %r\n", pad, attr, value) + if footer is not None: print(footer, file=f) + +FUN, FDT, FNU, FGE, FTX = range(5) # unknown, date, number, general, text +DATEFORMAT = FDT +NUMBERFORMAT = FNU + +( + XL_CELL_EMPTY, + XL_CELL_TEXT, + XL_CELL_NUMBER, + XL_CELL_DATE, + XL_CELL_BOOLEAN, + XL_CELL_ERROR, + XL_CELL_BLANK, # for use in debugging, gathering stats, etc +) = range(7) + +biff_text_from_num = { + 0: "(not BIFF)", + 20: "2.0", + 21: "2.1", + 30: "3", + 40: "4S", + 45: "4W", + 50: "5", + 70: "7", + 80: "8", + 85: "8X", + } + +## +# <p>This dictionary can be used to produce a text version of the internal codes +# that Excel uses for error cells. Here are its contents: +# <pre> +# 0x00: '#NULL!', # Intersection of two cell ranges is empty +# 0x07: '#DIV/0!', # Division by zero +# 0x0F: '#VALUE!', # Wrong type of operand +# 0x17: '#REF!', # Illegal or deleted cell reference +# 0x1D: '#NAME?', # Wrong function or range name +# 0x24: '#NUM!', # Value range overflow +# 0x2A: '#N/A', # Argument or function not available +# </pre></p> + +error_text_from_code = { + 0x00: '#NULL!', # Intersection of two cell ranges is empty + 0x07: '#DIV/0!', # Division by zero + 0x0F: '#VALUE!', # Wrong type of operand + 0x17: '#REF!', # Illegal or deleted cell reference + 0x1D: '#NAME?', # Wrong function or range name + 0x24: '#NUM!', # Value range overflow + 0x2A: '#N/A', # Argument or function not available +} + +BIFF_FIRST_UNICODE = 80 + +XL_WORKBOOK_GLOBALS = WBKBLOBAL = 0x5 +XL_WORKBOOK_GLOBALS_4W = 0x100 +XL_WORKSHEET = WRKSHEET = 0x10 + +XL_BOUNDSHEET_WORKSHEET = 0x00 +XL_BOUNDSHEET_CHART = 0x02 +XL_BOUNDSHEET_VB_MODULE = 0x06 + +# XL_RK2 = 0x7e +XL_ARRAY = 0x0221 +XL_ARRAY2 = 0x0021 +XL_BLANK = 0x0201 +XL_BLANK_B2 = 0x01 +XL_BOF = 0x809 +XL_BOOLERR = 0x205 +XL_BOOLERR_B2 = 0x5 +XL_BOUNDSHEET = 0x85 +XL_BUILTINFMTCOUNT = 0x56 +XL_CF = 0x01B1 +XL_CODEPAGE = 0x42 +XL_COLINFO = 0x7D +XL_COLUMNDEFAULT = 0x20 # BIFF2 only +XL_COLWIDTH = 0x24 # BIFF2 only +XL_CONDFMT = 0x01B0 +XL_CONTINUE = 0x3c +XL_COUNTRY = 0x8C +XL_DATEMODE = 0x22 +XL_DEFAULTROWHEIGHT = 0x0225 +XL_DEFCOLWIDTH = 0x55 +XL_DIMENSION = 0x200 +XL_DIMENSION2 = 0x0 +XL_EFONT = 0x45 +XL_EOF = 0x0a +XL_EXTERNNAME = 0x23 +XL_EXTERNSHEET = 0x17 +XL_EXTSST = 0xff +XL_FEAT11 = 0x872 +XL_FILEPASS = 0x2f +XL_FONT = 0x31 +XL_FONT_B3B4 = 0x231 +XL_FORMAT = 0x41e +XL_FORMAT2 = 0x1E # BIFF2, BIFF3 +XL_FORMULA = 0x6 +XL_FORMULA3 = 0x206 +XL_FORMULA4 = 0x406 +XL_GCW = 0xab +XL_HLINK = 0x01B8 +XL_QUICKTIP = 0x0800 +XL_HORIZONTALPAGEBREAKS = 0x1b +XL_INDEX = 0x20b +XL_INTEGER = 0x2 # BIFF2 only +XL_IXFE = 0x44 # BIFF2 only +XL_LABEL = 0x204 +XL_LABEL_B2 = 0x04 +XL_LABELRANGES = 0x15f +XL_LABELSST = 0xfd +XL_LEFTMARGIN = 0x26 +XL_TOPMARGIN = 0x28 +XL_RIGHTMARGIN = 0x27 +XL_BOTTOMMARGIN = 0x29 +XL_HEADER = 0x14 +XL_FOOTER = 0x15 +XL_HCENTER = 0x83 +XL_VCENTER = 0x84 +XL_MERGEDCELLS = 0xE5 +XL_MSO_DRAWING = 0x00EC +XL_MSO_DRAWING_GROUP = 0x00EB +XL_MSO_DRAWING_SELECTION = 0x00ED +XL_MULRK = 0xbd +XL_MULBLANK = 0xbe +XL_NAME = 0x18 +XL_NOTE = 0x1c +XL_NUMBER = 0x203 +XL_NUMBER_B2 = 0x3 +XL_OBJ = 0x5D +XL_PAGESETUP = 0xA1 +XL_PALETTE = 0x92 +XL_PANE = 0x41 +XL_PRINTGRIDLINES = 0x2B +XL_PRINTHEADERS = 0x2A +XL_RK = 0x27e +XL_ROW = 0x208 +XL_ROW_B2 = 0x08 +XL_RSTRING = 0xd6 +XL_SCL = 0x00A0 +XL_SHEETHDR = 0x8F # BIFF4W only +XL_SHEETPR = 0x81 +XL_SHEETSOFFSET = 0x8E # BIFF4W only +XL_SHRFMLA = 0x04bc +XL_SST = 0xfc +XL_STANDARDWIDTH = 0x99 +XL_STRING = 0x207 +XL_STRING_B2 = 0x7 +XL_STYLE = 0x293 +XL_SUPBOOK = 0x1AE # aka EXTERNALBOOK in OOo docs +XL_TABLEOP = 0x236 +XL_TABLEOP2 = 0x37 +XL_TABLEOP_B2 = 0x36 +XL_TXO = 0x1b6 +XL_UNCALCED = 0x5e +XL_UNKNOWN = 0xffff +XL_VERTICALPAGEBREAKS = 0x1a +XL_WINDOW2 = 0x023E +XL_WINDOW2_B2 = 0x003E +XL_WRITEACCESS = 0x5C +XL_WSBOOL = XL_SHEETPR +XL_XF = 0xe0 +XL_XF2 = 0x0043 # BIFF2 version of XF record +XL_XF3 = 0x0243 # BIFF3 version of XF record +XL_XF4 = 0x0443 # BIFF4 version of XF record + +boflen = {0x0809: 8, 0x0409: 6, 0x0209: 6, 0x0009: 4} +bofcodes = (0x0809, 0x0409, 0x0209, 0x0009) + +XL_FORMULA_OPCODES = (0x0006, 0x0406, 0x0206) + +_cell_opcode_list = [ + XL_BOOLERR, + XL_FORMULA, + XL_FORMULA3, + XL_FORMULA4, + XL_LABEL, + XL_LABELSST, + XL_MULRK, + XL_NUMBER, + XL_RK, + XL_RSTRING, + ] +_cell_opcode_dict = {} +for _cell_opcode in _cell_opcode_list: + _cell_opcode_dict[_cell_opcode] = 1 + +def is_cell_opcode(c): + return c in _cell_opcode_dict + +def upkbits(tgt_obj, src, manifest, local_setattr=setattr): + for n, mask, attr in manifest: + local_setattr(tgt_obj, attr, (src & mask) >> n) + +def upkbitsL(tgt_obj, src, manifest, local_setattr=setattr, local_int=int): + for n, mask, attr in manifest: + local_setattr(tgt_obj, attr, local_int((src & mask) >> n)) + +def unpack_string(data, pos, encoding, lenlen=1): + nchars = unpack('<' + 'BH'[lenlen-1], data[pos:pos+lenlen])[0] + pos += lenlen + return unicode(data[pos:pos+nchars], encoding) + +def unpack_string_update_pos(data, pos, encoding, lenlen=1, known_len=None): + if known_len is not None: + # On a NAME record, the length byte is detached from the front of the string. + nchars = known_len + else: + nchars = unpack('<' + 'BH'[lenlen-1], data[pos:pos+lenlen])[0] + pos += lenlen + newpos = pos + nchars + return (unicode(data[pos:newpos], encoding), newpos) + +def unpack_unicode(data, pos, lenlen=2): + "Return unicode_strg" + nchars = unpack('<' + 'BH'[lenlen-1], data[pos:pos+lenlen])[0] + if not nchars: + # Ambiguous whether 0-length string should have an "options" byte. + # Avoid crash if missing. + return UNICODE_LITERAL("") + pos += lenlen + options = BYTES_ORD(data[pos]) + pos += 1 + # phonetic = options & 0x04 + # richtext = options & 0x08 + if options & 0x08: + # rt = unpack('<H', data[pos:pos+2])[0] # unused + pos += 2 + if options & 0x04: + # sz = unpack('<i', data[pos:pos+4])[0] # unused + pos += 4 + if options & 0x01: + # Uncompressed UTF-16-LE + rawstrg = data[pos:pos+2*nchars] + # if DEBUG: print "nchars=%d pos=%d rawstrg=%r" % (nchars, pos, rawstrg) + strg = unicode(rawstrg, 'utf_16_le') + # pos += 2*nchars + else: + # Note: this is COMPRESSED (not ASCII!) encoding!!! + # Merely returning the raw bytes would work OK 99.99% of the time + # if the local codepage was cp1252 -- however this would rapidly go pear-shaped + # for other codepages so we grit our Anglocentric teeth and return Unicode :-) + + strg = unicode(data[pos:pos+nchars], "latin_1") + # pos += nchars + # if richtext: + # pos += 4 * rt + # if phonetic: + # pos += sz + # return (strg, pos) + return strg + +def unpack_unicode_update_pos(data, pos, lenlen=2, known_len=None): + "Return (unicode_strg, updated value of pos)" + if known_len is not None: + # On a NAME record, the length byte is detached from the front of the string. + nchars = known_len + else: + nchars = unpack('<' + 'BH'[lenlen-1], data[pos:pos+lenlen])[0] + pos += lenlen + if not nchars and not data[pos:]: + # Zero-length string with no options byte + return (UNICODE_LITERAL(""), pos) + options = BYTES_ORD(data[pos]) + pos += 1 + phonetic = options & 0x04 + richtext = options & 0x08 + if richtext: + rt = unpack('<H', data[pos:pos+2])[0] + pos += 2 + if phonetic: + sz = unpack('<i', data[pos:pos+4])[0] + pos += 4 + if options & 0x01: + # Uncompressed UTF-16-LE + strg = unicode(data[pos:pos+2*nchars], 'utf_16_le') + pos += 2*nchars + else: + # Note: this is COMPRESSED (not ASCII!) encoding!!! + strg = unicode(data[pos:pos+nchars], "latin_1") + pos += nchars + if richtext: + pos += 4 * rt + if phonetic: + pos += sz + return (strg, pos) + +def unpack_cell_range_address_list_update_pos( + output_list, data, pos, biff_version, addr_size=6): + # output_list is updated in situ + assert addr_size in (6, 8) + # Used to assert size == 6 if not BIFF8, but pyWLWriter writes + # BIFF8-only MERGEDCELLS records in a BIFF5 file! + n, = unpack("<H", data[pos:pos+2]) + pos += 2 + if n: + if addr_size == 6: + fmt = "<HHBB" + else: + fmt = "<HHHH" + for _unused in xrange(n): + ra, rb, ca, cb = unpack(fmt, data[pos:pos+addr_size]) + output_list.append((ra, rb+1, ca, cb+1)) + pos += addr_size + return pos + +_brecstrg = """\ +0000 DIMENSIONS_B2 +0001 BLANK_B2 +0002 INTEGER_B2_ONLY +0003 NUMBER_B2 +0004 LABEL_B2 +0005 BOOLERR_B2 +0006 FORMULA +0007 STRING_B2 +0008 ROW_B2 +0009 BOF_B2 +000A EOF +000B INDEX_B2_ONLY +000C CALCCOUNT +000D CALCMODE +000E PRECISION +000F REFMODE +0010 DELTA +0011 ITERATION +0012 PROTECT +0013 PASSWORD +0014 HEADER +0015 FOOTER +0016 EXTERNCOUNT +0017 EXTERNSHEET +0018 NAME_B2,5+ +0019 WINDOWPROTECT +001A VERTICALPAGEBREAKS +001B HORIZONTALPAGEBREAKS +001C NOTE +001D SELECTION +001E FORMAT_B2-3 +001F BUILTINFMTCOUNT_B2 +0020 COLUMNDEFAULT_B2_ONLY +0021 ARRAY_B2_ONLY +0022 DATEMODE +0023 EXTERNNAME +0024 COLWIDTH_B2_ONLY +0025 DEFAULTROWHEIGHT_B2_ONLY +0026 LEFTMARGIN +0027 RIGHTMARGIN +0028 TOPMARGIN +0029 BOTTOMMARGIN +002A PRINTHEADERS +002B PRINTGRIDLINES +002F FILEPASS +0031 FONT +0032 FONT2_B2_ONLY +0036 TABLEOP_B2 +0037 TABLEOP2_B2 +003C CONTINUE +003D WINDOW1 +003E WINDOW2_B2 +0040 BACKUP +0041 PANE +0042 CODEPAGE +0043 XF_B2 +0044 IXFE_B2_ONLY +0045 EFONT_B2_ONLY +004D PLS +0051 DCONREF +0055 DEFCOLWIDTH +0056 BUILTINFMTCOUNT_B3-4 +0059 XCT +005A CRN +005B FILESHARING +005C WRITEACCESS +005D OBJECT +005E UNCALCED +005F SAVERECALC +0063 OBJECTPROTECT +007D COLINFO +007E RK2_mythical_? +0080 GUTS +0081 WSBOOL +0082 GRIDSET +0083 HCENTER +0084 VCENTER +0085 BOUNDSHEET +0086 WRITEPROT +008C COUNTRY +008D HIDEOBJ +008E SHEETSOFFSET +008F SHEETHDR +0090 SORT +0092 PALETTE +0099 STANDARDWIDTH +009B FILTERMODE +009C FNGROUPCOUNT +009D AUTOFILTERINFO +009E AUTOFILTER +00A0 SCL +00A1 SETUP +00AB GCW +00BD MULRK +00BE MULBLANK +00C1 MMS +00D6 RSTRING +00D7 DBCELL +00DA BOOKBOOL +00DD SCENPROTECT +00E0 XF +00E1 INTERFACEHDR +00E2 INTERFACEEND +00E5 MERGEDCELLS +00E9 BITMAP +00EB MSO_DRAWING_GROUP +00EC MSO_DRAWING +00ED MSO_DRAWING_SELECTION +00EF PHONETIC +00FC SST +00FD LABELSST +00FF EXTSST +013D TABID +015F LABELRANGES +0160 USESELFS +0161 DSF +01AE SUPBOOK +01AF PROTECTIONREV4 +01B0 CONDFMT +01B1 CF +01B2 DVAL +01B6 TXO +01B7 REFRESHALL +01B8 HLINK +01BC PASSWORDREV4 +01BE DV +01C0 XL9FILE +01C1 RECALCID +0200 DIMENSIONS +0201 BLANK +0203 NUMBER +0204 LABEL +0205 BOOLERR +0206 FORMULA_B3 +0207 STRING +0208 ROW +0209 BOF +020B INDEX_B3+ +0218 NAME +0221 ARRAY +0223 EXTERNNAME_B3-4 +0225 DEFAULTROWHEIGHT +0231 FONT_B3B4 +0236 TABLEOP +023E WINDOW2 +0243 XF_B3 +027E RK +0293 STYLE +0406 FORMULA_B4 +0409 BOF +041E FORMAT +0443 XF_B4 +04BC SHRFMLA +0800 QUICKTIP +0809 BOF +0862 SHEETLAYOUT +0867 SHEETPROTECTION +0868 RANGEPROTECTION +""" + +biff_rec_name_dict = {} +for _buff in _brecstrg.splitlines(): + _numh, _name = _buff.split() + biff_rec_name_dict[int(_numh, 16)] = _name +del _buff, _name, _brecstrg + +def hex_char_dump(strg, ofs, dlen, base=0, fout=sys.stdout, unnumbered=False): + endpos = min(ofs + dlen, len(strg)) + pos = ofs + numbered = not unnumbered + num_prefix = '' + while pos < endpos: + endsub = min(pos + 16, endpos) + substrg = strg[pos:endsub] + lensub = endsub - pos + if lensub <= 0 or lensub != len(substrg): + fprintf( + sys.stdout, + '??? hex_char_dump: ofs=%d dlen=%d base=%d -> endpos=%d pos=%d endsub=%d substrg=%r\n', + ofs, dlen, base, endpos, pos, endsub, substrg) + break + hexd = ''.join(["%02x " % BYTES_ORD(c) for c in substrg]) + + chard = '' + for c in substrg: + c = chr(BYTES_ORD(c)) + if c == '\0': + c = '~' + elif not (' ' <= c <= '~'): + c = '?' + chard += c + if numbered: + num_prefix = "%5d: " % (base+pos-ofs) + + fprintf(fout, "%s %-48s %s\n", num_prefix, hexd, chard) + pos = endsub + +def biff_dump(mem, stream_offset, stream_len, base=0, fout=sys.stdout, unnumbered=False): + pos = stream_offset + stream_end = stream_offset + stream_len + adj = base - stream_offset + dummies = 0 + numbered = not unnumbered + num_prefix = '' + while stream_end - pos >= 4: + rc, length = unpack('<HH', mem[pos:pos+4]) + if rc == 0 and length == 0: + if mem[pos:] == b'\0' * (stream_end - pos): + dummies = stream_end - pos + savpos = pos + pos = stream_end + break + if dummies: + dummies += 4 + else: + savpos = pos + dummies = 4 + pos += 4 + else: + if dummies: + if numbered: + num_prefix = "%5d: " % (adj + savpos) + fprintf(fout, "%s---- %d zero bytes skipped ----\n", num_prefix, dummies) + dummies = 0 + recname = biff_rec_name_dict.get(rc, '<UNKNOWN>') + if numbered: + num_prefix = "%5d: " % (adj + pos) + fprintf(fout, "%s%04x %s len = %04x (%d)\n", num_prefix, rc, recname, length, length) + pos += 4 + hex_char_dump(mem, pos, length, adj+pos, fout, unnumbered) + pos += length + if dummies: + if numbered: + num_prefix = "%5d: " % (adj + savpos) + fprintf(fout, "%s---- %d zero bytes skipped ----\n", num_prefix, dummies) + if pos < stream_end: + if numbered: + num_prefix = "%5d: " % (adj + pos) + fprintf(fout, "%s---- Misc bytes at end ----\n", num_prefix) + hex_char_dump(mem, pos, stream_end-pos, adj + pos, fout, unnumbered) + elif pos > stream_end: + fprintf(fout, "Last dumped record has length (%d) that is too large\n", length) + +def biff_count_records(mem, stream_offset, stream_len, fout=sys.stdout): + pos = stream_offset + stream_end = stream_offset + stream_len + tally = {} + while stream_end - pos >= 4: + rc, length = unpack('<HH', mem[pos:pos+4]) + if rc == 0 and length == 0: + if mem[pos:] == b'\0' * (stream_end - pos): + break + recname = "<Dummy (zero)>" + else: + recname = biff_rec_name_dict.get(rc, None) + if recname is None: + recname = "Unknown_0x%04X" % rc + if recname in tally: + tally[recname] += 1 + else: + tally[recname] = 1 + pos += length + 4 + slist = sorted(tally.items()) + for recname, count in slist: + print("%8d %s" % (count, recname), file=fout) + +encoding_from_codepage = { + 1200 : 'utf_16_le', + 10000: 'mac_roman', + 10006: 'mac_greek', # guess + 10007: 'mac_cyrillic', # guess + 10029: 'mac_latin2', # guess + 10079: 'mac_iceland', # guess + 10081: 'mac_turkish', # guess + 32768: 'mac_roman', + 32769: 'cp1252', + } +# some more guessing, for Indic scripts +# codepage 57000 range: +# 2 Devanagari [0] +# 3 Bengali [1] +# 4 Tamil [5] +# 5 Telegu [6] +# 6 Assamese [1] c.f. Bengali +# 7 Oriya [4] +# 8 Kannada [7] +# 9 Malayalam [8] +# 10 Gujarati [3] +# 11 Gurmukhi [2] |