import re
from collections import Counter
def wordList(text, wordChars = None, ignoreCase = True):
    """Convert a string, simple-mindedly, to a list of words.

    Whitespace characters, "." and "," act as word boundaries.  Any other
    character that is not listed in wordChars is deleted without splitting
    the word it sits in (so "don't" becomes "dont").

    text       -- the string to split
    wordChars  -- iterable of single characters allowed inside a word; a
                  string or a set both work.  Defaults to the ASCII letters
                  plus "-".  The caller's object is never modified.
    ignoreCase -- if true, text is lower-cased before filtering, so
                  upper-case characters in a custom wordChars never match

    Returns the words in order of appearance, duplicates included.
    """
    if wordChars is None:
        lcLetters = "abcdefghijklmnopqrstuvwxyz"
        wordChars = lcLetters + lcLetters.upper() + "-"
    # Copy into a fresh set: this accepts strings as well as sets and keeps
    # the caller's collection untouched when the space is added below.
    allowed = set(wordChars)
    # spaces are always allowed -- they are the word separator
    allowed.add(" ")
    if ignoreCase:
        text = text.lower()
    # Turn boundaries into spaces BEFORE filtering.  The boundary characters
    # are normally not in wordChars, so filtering first would delete them and
    # glue neighbouring words together.
    for boundary in "\n\r\f\t\v" + ".,":
        text = text.replace(boundary, " ")
    # drop unwanted chars
    text = "".join([ch for ch in text if ch in allowed])
    # Runs of spaces yield empty strings when splitting on " "; discard them.
    return [word for word in text.split(" ") if word != ""]
def reWordList(text, wordChars = None, ignoreCase = True):
    """Convert a string, simple-mindedly, to a list of words using regexes.

    Whitespace, ".", "," and the double quote act as word boundaries.  Any
    other character that is not listed in wordChars is deleted without
    splitting the word it sits in.

    text       -- the string to split
    wordChars  -- iterable of single characters allowed inside a word; a
                  string or a set both work.  Every character is taken
                  literally.  Defaults to the ASCII letters.
    ignoreCase -- if true, text is lower-cased before filtering, so
                  upper-case characters in a custom wordChars never match

    Returns the words in order of appearance, duplicates included.
    """
    if wordChars is None:
        lcLetters = "abcdefghijklmnopqrstuvwxyz"
        wordChars = lcLetters + lcLetters.upper()
    # period, double quote and comma delimit words in addition to whitespace
    extraWordBoundaries = '.",'
    # convert all word boundaries to plain spaces
    text = re.sub(r"[\s" + re.escape(extraWordBoundaries) + "]", " ", text)
    if ignoreCase:
        text = text.lower()
    # Escape the allowed characters so that "-", "]", "^" and the backslash
    # are matched literally instead of being read as character-class syntax.
    keep = re.escape("".join(wordChars))
    # drop unwanted chars (which neither delimit words nor form part of
    # them); the space is retained as the separator
    text = re.sub("[^" + keep + " ]", "", text)
    # Runs of spaces yield empty strings when splitting on " "; discard them.
    return [word for word in text.split(" ") if word != ""]
def wordFreqs(text, wordChars = None):
    """Count how often each word occurs in a string.

    text is split into words with wordList(text, wordChars), which ignores
    case by default.

    text      -- the string to analyse
    wordChars -- passed through to wordList; None selects its default

    Returns a list of (word, count) tuples sorted by word.
    """
    # One counting pass instead of a list.count() scan per unique word.
    return sorted(Counter(wordList(text, wordChars)).items())
if __name__ == "__main__":
    # Demo: print the word frequencies of a text, then two summary counts.
    # Set loadFile = False to analyse the built-in sample instead of the file.
    loadFile = True
    if loadFile:
        # "with" closes the file promptly, even if reading fails.  Reading
        # the whole file keeps the line breaks it already contains.
        with open(r"d:\partdb.sql") as f:
            a = f.read()
    else:
        a = """ Nor again is there anyone who loves or pursues or desires to obtain pain of itself
because it is pain, but because occasionally circumstances occur in which toil and pain can
procure him some great pleasure. To take a trivial example, which of us ever undertakes
laborious physical exercise, except to obtain some advantage from it?"""
    b = wordFreqs(a)
    # example: find integers only
    #b = wordFreqs(a, wordChars = "0123456789")
    # Single-argument print(...) calls produce the same output whether print
    # is a statement (Python 2) or a function (Python 3).
    print(b)
    print("\n" * 5)
    print("number of words in original:  %d" % len(wordList(a)))
    print("Number of unique words:  %d" % len(b))