# -*- coding: utf-8 -*-
#
# Provenance: this script was added as dsi_news_grabber.py by the following
# Mercurial changeset.
#
# HG changeset patch
# User svartalf
# Date 1297254377 -28800
# Node ID a6358968a9180cad4b8b7479b2f5832fc1810893
# Parent e35beb1727f9679597f8bf1a3c9fde856e2590ee
# Commit message: News grabber for dsi.ru

"""Old news grabber for dsi.ru.

Downloads the news index page for the current year, follows every news link
found inside the ``<td class="contentBody">`` cell and stores each item that
is not yet present in the ``news`` collection of the local MongoDB database
``dsi``.  The numeric id taken from the link is used as the document ``_id``.

The script targets Python 2 (``urllib2``, ``unicode``, ``types.StringTypes``)
and runs entirely at import time.
"""

import datetime
import re
import types
import urllib2
import warnings

import chardet
import html5lib
import pymongo

warnings.filterwarnings("ignore")

# News links look like /news/2011/02/09/1234.html.  Only the positional
# groups are consumed below; the group names are purely descriptive.
LINK_REGEXP = re.compile(
    r'^/news/(?P<year>\d{4})/(?P<month>\d{2})/(?P<day>\d{2})/(?P<id>\d+)\.html$'
)

connection = pymongo.Connection('localhost', 27017)
db = connection['dsi']

parser = html5lib.HTMLParser(
    tree=html5lib.treebuilders.getTreeBuilder("beautifulsoup")
)
req = urllib2.Request(
    url='http://www.dsi.ru/news/%s/' % datetime.date.today().year
)
req.add_header('User-agent', 'RSS export 0.1 (See http://bit.ly/gSfFGa)')
# NOTE(review): the index page is decoded as cp1252 while the article pages
# below are handed to the parser as raw bytes, and ``chardet`` is imported but
# never used -- confirm which encoding handling is actually intended.
document = parser.parse(urllib2.urlopen(req).read().decode('cp1252'))


def to_string(node):
    """Return the concatenated text of the direct children of *node*.

    String children are converted with ``unicode()`` as they are; for every
    other child its ``text`` attribute is used instead.

    :param node: parsed tree node exposing a ``contents`` sequence.
    :returns: unicode string built from all children, in document order.
    """
    return u''.join(
        unicode(part) if isinstance(part, types.StringTypes)
        else unicode(part.text)
        for part in node.contents
    )


last_news = document.find('td', {'class': 'contentBody'}).findAll('a')
for entry in last_news:
    href = entry.get('href')
    match = LINK_REGEXP.match(href) if href else None
    if match is None:
        # Anchor without href, or a link that is not a news item: skip it
        # instead of failing on ``None.groups()``.
        continue

    year, month, day, id_ = [int(value) for value in match.groups()]
    date = datetime.datetime(year, month, day)

    # The link id doubles as the MongoDB _id, so a non-zero count means the
    # item was stored by an earlier run.
    already_stored = db.news.find({'_id': id_}).count()
    if already_stored:
        continue

    news_document = parser.parse(
        urllib2.urlopen('http://www.dsi.ru%s' % href).read()
    )
    news_content = news_document.find('td', {'class': 'contentBody'})
    title = to_string(news_content.find('font', {'class': 'topicPage'}))
    content = u''.join(map(to_string, news_content.findAll('p'))).strip()
    db.news.insert({
        '_id': id_,
        'created': date,
        'title': title,
        'content': content,
    })