Sunday, January 20, 2013

Extract Email Attachments With Python

If you archive your email messages, like I do, you may find that you want to pull all of the attachments out of those files so that your desktop search can parse them better, or so that you can quickly search through them yourself.

This is a simple script that recursively walks a directory, looks for .eml messages, and pulls out all of the base64-encoded attachments.

For those of you who are wondering what base64 is: it's an encoding that uses only sixty-four different characters to represent arbitrary binary data. The email system uses it to send documents around so that the protocol doesn't have to be reworked to handle content that isn't text.

Code

#!/usr/bin/env python

import email.parser
import os
import sys
import base64


fileList = []
rootdir = "/path/to/.eml/messages/"

# Collect every file below rootdir up front, so that attachments written
# later can never be picked up by the directory walk itself.
for root, subFolders, files in os.walk(rootdir):
    for name in files:
        fileList.append(os.path.join(root, name))

for path in fileList:
    if not path.endswith(".eml"):
        continue

    # Parse the whole message; the handle is closed as soon as it is read.
    parser = email.parser.FeedParser()
    with open(path) as handle:
        parser.feed(handle.read())
    message = parser.close()

    for part in message.walk():
        fn = part.get_filename()
        if fn is None:
            # Only parts that declare a filename are treated as attachments.
            continue

        # decode=True honours the part's Content-Transfer-Encoding (base64,
        # quoted-printable, ...) instead of assuming base64 for everything.
        # It returns None for multipart containers, which carry no bytes.
        payload = part.get_payload(decode=True)
        if payload is None:
            continue

        # The filename comes from the mail and is untrusted: keep only its
        # last path component so it cannot escape the working directory.
        safe_name = os.path.basename(fn.replace("\\", "/"))
        if safe_name in ("", ".", ".."):
            continue

        # Attachments are written to the current working directory.
        with open(safe_name, 'wb') as out:
            out.write(payload)

Extensions

  • This script isn't very efficient, since it does all of the decoding in Python.
  • It would be nice to take the directory to search from the command line using sys.argv.

Update 2013-09-04 Python 3


#!/usr/bin/env python3

import email.parser
import os
import sys
import base64
import binascii
import sys


def extract(rootdir, outdir="."):
    """Save every named attachment found in the ``.eml`` files under *rootdir*.

    The tree below *rootdir* is walked recursively. Every file whose name
    ends in ``.eml`` is parsed as an email message, and each MIME part that
    declares a filename is decoded according to its
    Content-Transfer-Encoding and written to *outdir*.

    Args:
        rootdir: Directory that is searched recursively for ``.eml`` files.
        outdir: Directory that receives the attachments. It is created if it
            does not exist. Defaults to the current working directory.

    Returns:
        None. A "Checking ..." line is printed for every message, and an
        "Error extracting item ..." line for every attachment that could
        not be written.

    Note:
        Attachments are stored under the base name of their declared
        filename, so attachments sharing a name overwrite one another.
    """
    os.makedirs(outdir, exist_ok=True)

    # Materialise the list before writing anything, so that extracted
    # attachments are never picked up by the directory walk itself.
    eml_paths = [
        os.path.join(root, name)
        for root, _subfolders, files in os.walk(rootdir)
        for name in files
        if name.endswith(".eml")
    ]

    for path in eml_paths:
        parser = email.parser.BytesFeedParser()
        with open(path, "rb") as handle:
            parser.feed(handle.read())
        message = parser.close()

        print("Checking {}".format(path))

        for part in message.walk():
            filename = part.get_filename()
            if filename is None:
                # Only parts that declare a filename are attachments.
                continue

            # Decode before opening the target, so that a part which cannot
            # be decoded does not leave an empty file behind. The result is
            # None for multipart containers, which carry no bytes.
            payload = part.get_payload(decode=True)

            # The filename comes from the mail and is untrusted: keep only
            # its last path component so it cannot escape outdir.
            safe_name = os.path.basename(filename.replace("\\", "/"))

            if payload is None or safe_name in ("", ".", ".."):
                print("Error extracting item from {}".format(path))
                continue

            try:
                with open(os.path.join(outdir, safe_name), "wb") as out:
                    out.write(payload)
            except (OSError, ValueError):
                # Best effort: one unwritable attachment (bad name, full
                # disk, ...) must not abort the rest of the batch.
                print("Error extracting item from {}".format(path))

if __name__ == "__main__":
    # The directory to search is the first command-line argument; without
    # it, print a usage line and exit with a failure status.
    if len(sys.argv) == 1:
        print("usage: {} path/to/.eml/files".format(sys.argv[0]))
        # sys.exit, not the bare exit(): the latter is injected by the site
        # module for interactive use and is absent under ``python -S``.
        sys.exit(1)
    extract(sys.argv[1])

No comments:

Post a Comment