In this case only Text and Numeric cells will render properly, formulas and such won't work. Also, the only sheet read is sheet1.
Okay, a little about the file structure of xlsx: Sheet info is stored in xl/worksheets/sheetX.xml, which looks essentially like this:
<sheetdata>
<row collapsed="false" customformat="false" customheight="false" hidden="false" ht="12.8" outlinelevel="0" r="1">
<c r="A1" s="0" t="s"><v>0</v></c>
<c r="B1" s="0" t="n"><v>123</v></c>
</row>
</sheetdata>
Cells (c tags) have several types (denoted by the t attribute) "s" is a string, and "n" is a number. The string's attribute is an index for a string in the xl/sharedStrings.xml file, which removes duplicate strings from the saved data.
''' Opens .xlsx files as if they were .csv files. '''
import zipfile
import xml.dom.minidom
import re
class XLSXReader:
rows=[]
def _nodeText(self, node):
return "".join(t.nodeValue for t in node.childNodes if t.nodeType == t.TEXT_NODE)
def _get_col_num(self, col):
strpart = col.attributes['r'].value
colnum = re.sub('[^A-Z]', '', strpart.upper().strip())
c = 0
for char in colnum:
c += ord(char)
c -= (65) # ASCII to number
print("Colnum for '%s' is %s" % (strpart, c))
return c
def __init__(self, filename):
shared_strings = []
self.rows = []
myFile = zipfile.ZipFile(filename)
# Read the shared strings file.
share = xml.dom.minidom.parseString(myFile.read('xl/sharedStrings.xml'))
j = share.getElementsByTagName("t")
for node in j:
shared_strings.append(self._nodeText(node))
sheet = xml.dom.minidom.parseString(myFile.read('xl/worksheets/sheet1.xml'))
sheetrows = sheet.getElementsByTagName("row")
for row in sheetrows:
cols = row.getElementsByTagName("c")
largest_col_num = 0
for col in cols:
colnum = self._get_col_num(col)
if colnum > largest_col_num:
largest_col_num = colnum
thiscol = ['']*(largest_col_num + 1)
for col in cols:
value = ""
try:
value = self._nodeText(col.getElementsByTagName('v')[0])
except IndexError:
continue
#Get col number (A=0, B=1, etc. up to AA)
colnum = self._get_col_num(col) # ASCII to number
try:
if col.attributes['t'].value == 's':
thiscol[colnum] = shared_strings[int(value)]
else:
thiscol[colnum] = value
except KeyError:
thiscol[colnum] = value
self.rows.append(thiscol)
myFile.close()
def __getitem__(self, i):
return self.rows[i]
To use the code, call the class with a file name, then use a for - each loop to iterate over the rows.
Update 2012-09-14: Mati noted that in some situations the xlsx format didn't have the proper attribute to indicate a number, and sent in a patch. Thanks!
No comments:
Post a Comment