Saturday, October 23, 2010

Java Source Code Scraping

I am taking a course in Java development whose textbook provides example programs.  Unlike many other books, the source code for these programs is trapped inside HTML pages.  Being a programmer, I don't like doing repetitive work, so I spent some time writing a Python script that takes a URL and a folder path, scrapes the given page for source code, strips out all the formatting, and writes each program, properly named, into the specified folder.



With a little work this script could scrape whatever you want, as long as it sits between known tags; a sketch of such an adaptation follows the listing.

#!/usr/bin/python

import re
import urllib2
import xml.sax.saxutils

numnones = 0

def unescape(string):
    '''Unescape the & entities in html, e.g. &quot; becomes a double quote.'''

    string = string.replace("&quot;", "\"") #Not handled by the xml lib
    string = xml.sax.saxutils.unescape(string)

    return string

def remove_HTML_tags(text):
    '''Removes html tags from a supplied string.

    Warning: This is accomplished using regular expressions that simply cut
    out all text in between and including less than and greater than
    characters.

    '''

    regex_html = re.compile(r'<.*?>')
    return regex_html.sub('', text)

def fetch_page(url):
    '''Returns the html for the webpage at the supplied url.'''

    page = urllib2.urlopen(url)
    pagetext = ""

    for line in page:
        pagetext += line

    return pagetext

def get_name(class_text):
    '''Returns the name of the abstract class, class, or interface, when
    given the text of a Java file.

    Returns None if the name can not be determined.

    Warning: Although Java classes may be named with unicode, this function
    will only return characters A-Z, a-z, and 0-9.

    '''

    #Compile the regular expressions; they ignore case and let '.' match newlines.
    class_ = re.compile(r'public class (.*?) ', re.DOTALL | re.IGNORECASE)
    interface = re.compile(r'public interface (.*?) ', re.DOTALL | re.IGNORECASE)
    abstract = re.compile(r'public abstract class (.*?) ', re.DOTALL | re.IGNORECASE)

    #Find the name of the class/interface/abstract class
    name = class_.findall(class_text) #Returns [] if none found

    if name == []:
        name = abstract.findall(class_text)

    if name == []:
        name = interface.findall(class_text)

    #If no name is found, return None.
    if name == []:
        return None

    #Remove any remaining characters outside A-Z, a-z, 0-9
    accepted_chars = re.compile(r'[^A-Za-z0-9]*')
    return accepted_chars.sub('', name[0])


def return_between(first_tag, second_tag, text):
    '''Returns a list of the text found between the given delimiters. All text
    outside the delimiters is discarded.

    Arguments:
    first_tag -- The tag that begins text harvesting. (string)
    second_tag -- The tag that ends text harvesting. (string)
    text -- The string in which the tags are to be found. (string)

    '''

    basic_split = text.split(first_tag)

    #Select only the sections which contain the close tag; discard the rest.
    second_split = []
    for i in basic_split:
        if second_tag in i:
            second_split.append(i)

    #Keep only the text that comes before the close tag
    between = []

    for line in second_split:
        value, end = line.split(second_tag, 1)
        between.append(value)

    return between

def write_file(name, content, location, ext=".java"):
    '''Writes a file with the name given, with the content provided, to the
    location given, with the given extension.

    Arguments:
    name -- The name of the file to be written.
    content -- The content of the file to be written.
    location -- The folder where the file will be created/overwritten.

    Keyword Arguments:
    ext -- The extension for the file. (default ".java")

    Warning: This function will overwrite any pre-existing files without
    giving warnings.

    '''

    if not location.endswith("/"): #FIXME This is not cross platform!
        location = location + "/"

    print("Location: " + location)

    f = open(location + name + ext, "w")
    f.write(content)
    f.close()

def main(url, output_folder):
    '''Fetches the text at the supplied url and outputs .java files of the
    code between pre tags.

    Arguments:
    url -- The url of the page to scrape code from. (string)
    output_folder -- The directory that the java files will be created in.

    '''
    global numnones
    page_text = fetch_page(url)

    #Get the text between the pre tags
    code_and_garbage = return_between("<pre>", "</pre>", page_text)

    #Walk through each entry, clean it, and write the source file
    for i in code_and_garbage:

        print i

        i = remove_HTML_tags(i)
        i = unescape(i)

        name = get_name(i)

        if name is None:
            write_file("None" + str(numnones), i, output_folder)
            numnones += 1
        else:
            write_file(name, i, output_folder)

if __name__ == "__main__":
    location = raw_input("Enter the url: ")
    output_folder = raw_input("Enter the folder to write to: ")

    main(location, output_folder)
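
As mentioned above, pointing the script at something other than Java listings mostly comes down to changing the delimiters passed to return_between.  The snippet below is a minimal sketch of that idea, harvesting <code> blocks instead of <pre> blocks; the URL and output folder are placeholders, not real locations, and it assumes the functions above are defined in the same file.

#Hypothetical adaptation: harvest <code> blocks instead of <pre> blocks.
#The url and folder here are placeholders.
page_text = fetch_page("http://example.com/snippets.html")

for block in return_between("<code>", "</code>", page_text):
    block = remove_HTML_tags(block)
    block = unescape(block)

    name = get_name(block)
    if name is None:
        name = "Unnamed"

    write_file(name, block, "/tmp/snippets")

Since get_name only understands Java class declarations, scraping anything else would also mean swapping in a different naming scheme; here the fallback is just a fixed placeholder name.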
