这几天在看《Python网络编程基础》,第6章和第7章介绍了如何获取Web内容和解析HTML,于是自己找了个例子实践了一下。
- #!/usr/bin/python
- # coding: utf-8
- import sys,re,urllib2,datetime
- from HTMLParser import HTMLParser
class main(HTMLParser):
    """Collect the text of table cells into per-date lists.

    Capture starts at any ``<table>`` start tag that carries exactly seven
    attributes and stops at the next ``</table>`` end tag.  While capturing,
    every ``<tr>`` resets the column offset to -2 and every ``<td>`` advances
    it by one, so the second ``<td>`` of a row maps to offset 0 (the date the
    parser was created), the third to offset 1, and so on.

    Text is recorded only when the most recent start tag was ``td``, ``th``
    or ``br`` and that tag had no attributes.  Text whose offset falls outside
    the seven tracked days, and text that is empty after stripping, is
    ignored.
    """

    def __init__(self):
        # Explicit base-class call instead of super(): the file imports the
        # Python 2 ``HTMLParser`` module, whose parser is an old-style class.
        HTMLParser.__init__(self)
        self.tarsum = ['td', 'th', 'br']  # tags whose text is recorded
        self.readingtitle = False         # True while inside the target table
        self.curtag = None                # name of the most recent start tag
        self.curattrs = 0                 # attribute count of that start tag
        self.marktag = None               # tag whose end tag stops capturing
        self.timeclock = 0                # day offset of the current cell
        # Take "today" once so every offset is relative to the same date even
        # if parsing happens to run across midnight.
        self.startdate = datetime.date.today()
        self.result = {}
        for offset in range(7):
            day = self.startdate + datetime.timedelta(days=offset)
            self.result[day] = []

    def handle_starttag(self, tag, attrs):
        """Track the current tag and update the capture state / day offset."""
        self.curtag = tag
        self.curattrs = len(attrs)
        if tag == 'table' and len(attrs) == 7:
            self.readingtitle = True
            self.marktag = tag
        if self.readingtitle:
            if tag == 'tr':
                # New row: the next two <td> tags bring the offset up to 0.
                self.timeclock = -2
            elif tag == 'td':
                self.timeclock += 1

    def handle_data(self, data):
        """Append stripped cell text to the list of the day it belongs to."""
        if not self.readingtitle:
            return
        if self.curattrs != 0 or self.curtag not in self.tarsum:
            return
        text = data.strip()
        if not text:
            # Whitespace between tags would otherwise be stored as ''.
            return
        day = self.startdate + datetime.timedelta(days=self.timeclock)
        cells = self.result.get(day)
        # Offsets outside the tracked week (e.g. the first <td> of a row,
        # offset -1) have no entry; skip them instead of raising KeyError.
        if cells is not None:
            cells.append(text)

    def handle_endtag(self, tag):
        """Stop capturing once the marked table is closed."""
        if tag == self.marktag:
            self.readingtitle = False

    def getvalue(self):
        """Return the dict mapping ``datetime.date`` to a list of cell texts."""
        return self.result
if __name__ == '__main__':
    # Fetch the forecast page.  A timeout keeps the script from hanging
    # forever on a stalled connection, and the response is always closed.
    response = urllib2.urlopen(
        'http://qq.ip138.com/weather/guangdong/GuangZhou.htm', timeout=30)
    try:
        raw = response.read()
    finally:
        response.close()

    # The page is decoded as GB2312 and re-encoded as UTF-8 bytes before
    # parsing; undecodable/unencodable characters are dropped.
    strhtml = unicode(raw, 'gb2312', 'ignore').encode('utf-8', 'ignore')

    tp = main()
    tp.feed(strhtml)

    # Dict order is arbitrary, so sort by date and print one line per day.
    forecast = tp.getvalue()
    for day in sorted(forecast):
        print(' '.join([str(day)] + forecast[day]))
结果如下: