Source:
import re
import operator

def rapid_automatic_keyword_extraction(document, stop_words=None):
    """Rank candidate keyword phrases of ``document`` with the RAKE scheme.

    The text is cut into parts at punctuation, each part is cut again at
    every stop word (and at any literal ``|``), and the remaining chunks are
    the candidate phrases.  Each word is scored as ``degree / frequency``,
    where ``degree`` is the summed length (in words) of every phrase the
    word occurs in, and a phrase scores the sum of its word scores.

    Args:
        document: Text to analyse.
        stop_words: Iterable of words that delimit phrases, matched
            case-insensitively on word boundaries.  They are treated as
            literal text, not as regular expressions.  ``None``, an empty
            iterable and empty-string entries mean "no stop word".

    Returns:
        A list of ``(phrase, score)`` tuples sorted by descending score,
        truncated to the top third of the distinct phrases (integer
        division, so fewer than three phrases yields an empty list).
        Phrases are lower-cased.
    """
    delimiter_re = re.compile(u'[.!?,;:\t\\-\\"\\(\\)\\\'\u2019\u2013]')
    word_split_re = re.compile('[^a-zA-Z0-9_\\+\\-/]')
    # Tokens of two or more characters made only of digits/dots are numbers,
    # not words; a single digit is deliberately kept.
    number_re = re.compile(r'^[0-9][0-9.]+$')

    # Empty entries are dropped: an empty alternative would match at every
    # position and shred the text into single characters.
    stop_terms = [word for word in (stop_words or ()) if word]
    stop_re = None
    if stop_terms:
        # re.escape keeps words such as "c++" from being read as regex syntax.
        stop_re = re.compile(
            '|'.join('\\b' + re.escape(word) + '\\b' for word in stop_terms),
            re.IGNORECASE)

    phrases = []
    for part in delimiter_re.split(document):
        part = part.strip()
        if stop_re is not None:
            part = stop_re.sub('|', part)
        for chunk in part.split('|'):
            chunk = chunk.strip().lower()
            if chunk:
                phrases.append(chunk)

    def split_words(text):
        """Return the lower-cased, non-numeric word tokens of ``text``."""
        words = []
        for token in word_split_re.split(text):
            token = token.strip().lower()
            if token and not number_re.match(token):
                words.append(token)
        return words

    # Tokenise every phrase occurrence once; both passes below reuse it.
    phrase_words = [(phrase, split_words(phrase)) for phrase in phrases]

    freq = {}
    degree = {}
    for _, words in phrase_words:
        phrase_length = len(words)
        for word in words:
            freq[word] = freq.get(word, 0) + 1
            # Co-occurring words plus the word itself.
            degree[word] = degree.get(word, 0) + phrase_length

    score = dict((word, degree[word] / float(freq[word])) for word in freq)

    keywords = {}
    for phrase, words in phrase_words:
        keywords[phrase] = sum(score[word] for word in words)

    # .items() and // behave the same on Python 2 and Python 3.
    sorted_keywords = sorted(keywords.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_keywords[:len(sorted_keywords) // 3]
Sample Usage:
text = """
Marvel's Iron Man 3 pits brash-but-brilliant industrialist Tony Stark/Iron Man against an enemy whose reach knows no bounds. When Stark finds his personal world destroyed at his enemy's hands, he embarks on a harrowing quest to find those responsible. This journey, at every turn, will test his mettle. With his back against the wall, Stark is left to survive by his own devices, relying on his ingenuity and instincts to protect those closest to him. As he fights his way back, Stark discovers the answer to the question that has secretly haunted him: does the man make the suit or does the suit make the man?
"""
import contextlib
import urllib

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    urlopen = urllib.urlopen  # Python 2

# Download a comma-separated stop word list.  closing() releases the HTTP
# response even if read() fails; Python 2's response object is not a context
# manager on its own.
with contextlib.closing(urlopen('http://www.textfixer.com/resources/common-english-words.txt')) as response:
    raw_stop_words = response.read()
if not isinstance(raw_stop_words, str):
    # Python 3 returns bytes; the extractor works on text.
    raw_stop_words = raw_stop_words.decode('utf-8')

# Strip whitespace and drop empty entries so a trailing newline or comma
# cannot end up inside the stop word pattern.
stop_words = [word.strip() for word in raw_stop_words.split(',') if word.strip()]

# NOTE(review): assumes the extractor lives in a module named `utils` that is
# imported elsewhere - confirm against the surrounding project.
keywords = utils.rapid_automatic_keyword_extraction(text, stop_words)
for keyword in keywords:
    # Parenthesised form prints the (phrase, score) tuple on Python 2 and 3.
    print(keyword)

"""
Results:
('brilliant industrialist tony stark/iron man against', 31.0)
('enemy whose reach knows', 14.5)
('iron man 3', 9.0)
('protect those closest', 9.0)
('find those responsible', 9.0)
('personal world destroyed', 9.0)
('back against', 6.0)
('man make', 5.0)
('pits brash', 4.0)
('harrowing quest', 4.0)
('way back', 4.0)
('secretly haunted', 4.0)
"""

Extracts keywords from text documents using Rapid Automatic Keyword Extraction (RAKE) for information retrieval and analysis. Implementation based on http://www.google.com/patents/US20110060747

Pythoneer 6 years ago