在前一版本的基础上,加入了排序:按物品和日期排序,并排除价格和物品都重复的记录。
#-*- encoding: gbk -*-
'''
Created on 2011-5-8
@author: fatkun
'''
import urllib2
import re
import time
import string
from operator import itemgetter
#读取网页
def read(url):
    """Fetch *url* over HTTP and return the raw response body.

    The request is sent with a ``Mozilla/5.0`` User-agent header instead of
    urllib2's default one.

    Raises whatever ``opener.open`` / ``response.read`` raise; in Python 2
    ``urllib2.URLError`` is a subclass of ``IOError``.
    """
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    response = opener.open(url)
    try:
        return response.read()
    finally:
        # urllib2 responses are not context managers in Python 2, so close
        # explicitly; otherwise the socket stays open until garbage collection.
        response.close()
#正则表达式返回需要的内容
# Matches one <tr> row of the transaction-record HTML (re.VERBOSE syntax, so
# unescaped whitespace and "#..." tails inside the pattern are ignored).
# Capture groups:
#   1 - text of the link in the second cell (captured but not used)
#   2 - item name
#   3 - price (digits inside <em>)
#   4 - contents of the date cell
# The item-name group is written as \S+(?:[ ]\S+)*[ ]? which accepts exactly
# the same text as the earlier (?:[\S]+[ ]?)+ form but has no nested
# quantifier, so a row that fails to match can no longer cause exponential
# backtracking.
_RECORD_PATTERN = re.compile(r"""<tr(?:[^>]+)?> #TR
[\S\s]*?<td[\S\s]*?</td>[\S\s]*?<td(?:[^>]+)?> #Second TD
[\S\s]*?<a [^>]+>([\S\s]*?)</a> # 链接
[\S\s]*?<br/>\s+(\S+(?:[ ]\S+)*[ ]?)\s+</td> # 物品名称
[\S\s]*?<em>(\d+)</em> # 价格
[\S\s]*?<td>[\S\s]*?</td> # TD
[\S\s]*?<td>([\S\s]*?)</td> # 日期
[\S\s]*?</tr> #Last TR
""", flags=re.MULTILINE|re.IGNORECASE|re.VERBOSE)


def display(content):
    """Extract transaction records from the HTML text *content*.

    Returns a list of ``(price, date, item_name)`` string tuples, one per
    matched table row, in document order.  Each field is whitespace-stripped
    and the date is cut to the first 11 characters of the stripped date cell.
    Returns an empty list when no row matches.
    """
    return [
        (price.strip(), date.strip()[:11], name.strip())
        for _link_text, name, price, date in _RECORD_PATTERN.findall(content)
    ]
filepath = 'c:\\log.txt'
resultfilepath = 'c:\\log_result.txt'
open(filepath, 'w').close()
lastpage = 50
#淘宝物品的成交记录下一页的链接,请复制链接,把最后一个页数的数字删掉,放在url变量里
url = 'http://tbskip.taobao.com/json/show_buyer_list.htm?is_offline=&page_size=15&is_start=false&item_type=b&ends=1305198406000&starts=1304593606000&item_id=5964804060&user_tag=475363344&old_quantity=5521&sold_total_num=3057&closed=false&seller_num_id=69211806&zhichong=true&bidPage='
onlyonelist = []
allfieldlist = []
for i in range(1, lastpage + 1):
fullurl = '%s%d' % (url, i)
#不出错运行后设为False
runfail = True
#重试次数
retry = 2
print '第%d页 - %s' % (i, fullurl)
try:
logfile = open(filepath, 'a')
while (runfail and retry >= 0):
try:
content = read(fullurl)
alist = display(content)
str = '';
for record in alist:
price_product_record = (record[0], record[2])
if price_product_record not in onlyonelist:
onlyonelist.append(price_product_record)
allfieldlist.append(record)
str += string.ljust(record[0], 7) + string.ljust(record[1], 15) + record[2]
str += '\n'
if str != '': logfile.write(str)
runfail = False
except IOError:
print 'IO error'
retry = retry - 1
time.sleep(5)
if (retry == 0):time.sleep(10) #最后一次尝试等10秒
logfile.flush()
except:
print 'write file fail!'
finally:
logfile.close()
#保存排序后结果
allfieldlist.sort(key=itemgetter(2, 1))
resultfile = open(resultfilepath, 'w')
str = ''
for record in allfieldlist:
str += string.ljust(record[0], 7) + string.ljust(record[1], 15) + record[2]
str += '\n'
resultfile.write(str)
resultfile.close()
print 'Success!'