# Let's pull resume data from craigslist posts.
# Listing pages live at http://<city>.craigslist.org/res/
# (author's note: a listing page yields up to 100 resumes).

import BeautifulSoup
from re import compile
from urllib import urlopen

# Absolute URL of a single resume post, optionally under a three-letter
# path segment. The dots are escaped so they match only a literal '.'.
POST_URL_RE = compile(
    r'^http://[a-z]+\.craigslist\.org(/[a-z]{3})?/res/[0-9]+\.html$')


def url_to_text(url):
    """Fetch *url* and return the body of the response as a string.

    Any failure (opening, reading or closing) is reported on stdout and
    turned into a ``None`` return value instead of an exception.
    """
    try:
        fh = urlopen(url)
        try:
            return fh.read()
        finally:
            # Release the connection even when the read itself fails.
            fh.close()
    except Exception as e:
        print('##### %s' % e)
        return None


def extract_city_resume_urls(city):
    """Return the resume-post URLs linked from the listing page of *city*.

    *city* is substituted as the subdomain of craigslist.org.  Returns an
    empty list when the listing page could not be fetched or was empty.
    """
    # NOTE: match() anchors only at the start, so this merely rejects names
    # that do not begin with a letter; it is also stripped under ``-O``.
    assert compile('[a-z]+').match(city.lower())

    listing_url = r'http://%(city)s.craigslist.org/res/' % {'city': city}
    text = url_to_text(listing_url)
    if not text:
        print("##### Couldn't extract any text for %s" % city)
        return []

    soup = BeautifulSoup.BeautifulSoup(text)
    hrefs = [dict(anchor.attrs).get('href', '')
             for anchor in soup.findAll('a')]
    return [href for href in hrefs if POST_URL_RE.match(href)]


def resume_url_to_text(url):
    """Return the text of the resume post at *url* as a single line.

    The direct children of the ``<div id="userbody">`` element that carry
    a string are stripped, empty pieces are dropped, and the remainder is
    joined with ``'. '``.  Returns ``''`` when the page cannot be fetched
    or contains no such element.
    """
    text = url_to_text(url)
    if not text:
        print("##### Couldn't extract resume text for %s" % url)
        return ''

    soup = BeautifulSoup.BeautifulSoup(text)
    body = soup.find(name='div', id='userbody')
    if body is None:
        # find() returns None when nothing matches; fail soft like above.
        print("##### Couldn't find resume body in %s" % url)
        return ''

    pieces = [node.string.strip() for node in body.contents if node.string]
    pieces = [piece for piece in pieces if len(piece) > 0]
    post = '\n'.join(pieces)
    # Line breaks inside a piece are flattened too, not just the joins.
    # NOTE(review): the separator literal was reconstructed as a newline
    # from a whitespace-mangled copy of this file -- confirm.
    post = post.replace('\n', '. ')
    return post