Python遍历淘宝成交记录（版本二）

在前一版本的基础上，加入了排序和去重：结果先按物品、再按日期排序，并去掉价格和物品都相同的重复记录。
#-*- encoding: gbk -*-
'''
Created on 2011-5-8

@author: fatkun
'''
import urllib2
import re
import time
import string
from operator import itemgetter

# Fetch a web page
def read(url):
    """Download ``url`` and return the response body.

    The request is sent with a browser-like ``User-agent`` header.

    Args:
        url: Address of the page to fetch.

    Returns:
        Whatever the response object's ``read()`` returns (the whole body).

    Raises:
        Any error raised by urllib2 while opening or reading the URL is
        propagated to the caller.
    """
    opener = urllib2.build_opener()
    # Debug aid: uncomment to trace the HTTP exchange.
    #opener.handle_open["http"][0].set_http_debuglevel(1)
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    response = opener.open(url)
    try:
        return response.read()
    finally:
        # Release the connection even when read() fails part-way through.
        response.close()

# Regular expression that pulls the wanted fields out of one table row.
# Compiled once at import time instead of on every display() call.
#
# Notes on the pattern:
#   * re.VERBOSE drops unescaped whitespace, so a literal space must be
#     written as "[ ]" (see the "<a[ ]" and the product-name group).
#   * The product-name group is written as "\S+(?:[ ]\S+)*[ ]?" rather than
#     "(?:\S+[ ]?)+": both accept the same text, but the nested quantifier in
#     the latter backtracks exponentially on a long token that is not
#     followed by "</td>".
_RECORD_PATTERN = re.compile(r"""<tr(?:[^>]+)?> # opening TR
                        [\S\s]*?<td[\S\s]*?</td>[\S\s]*?<td(?:[^>]+)?> # second TD
                        [\S\s]*?<a[ ][^>]+>([\S\s]*?)</a> # link
                        [\S\s]*?<br/>\s+(\S+(?:[ ]\S+)*[ ]?)\s+</td> # product name
                        [\S\s]*?<em>(\d+)</em> # price
                        [\S\s]*?<td>[\S\s]*?</td> # skipped TD
                        [\S\s]*?<td>([\S\s]*?)</td> # date
                        [\S\s]*?</tr> # closing TR
                        """, flags=re.MULTILINE|re.IGNORECASE|re.VERBOSE)


def display(content):
    """Extract the transaction records from one page of HTML.

    Args:
        content: HTML text of a transaction-record page.

    Returns:
        A list with one ``(price, date, product)`` tuple per matched table
        row, in document order. Every field is stripped of surrounding
        whitespace; ``date`` is additionally cut to its first 11 characters.
        An empty list is returned when nothing matches.
    """
    return [
        (row.group(3).strip(), row.group(4).strip()[:11], row.group(2).strip())
        for row in _RECORD_PATTERN.finditer(content)
    ]

def _format_record(record):
    """Render one (price, date, product) record as a fixed-width text line."""
    return record[0].ljust(7) + record[1].ljust(15) + record[2] + '\n'


def _fetch_page(fullurl, retries=2):
    """Download and parse one page, retrying on IOError.

    Makes up to ``retries + 1`` attempts. Between attempts it waits 5
    seconds, plus 10 more before the final attempt. No wait happens after
    the last failure.

    Returns:
        The list produced by ``display`` on success, or ``None`` when every
        attempt raised IOError.
    """
    for attempts_left in range(retries, -1, -1):
        try:
            return display(read(fullurl))
        except IOError:
            # urllib2.URLError derives from IOError, so network failures
            # land here.
            print('IO error')
            if attempts_left == 0:
                break
            time.sleep(5)
            if attempts_left == 1:
                # Wait 10 extra seconds before the last attempt.
                time.sleep(10)
    return None


filepath = 'c:\\log.txt'
resultfilepath = 'c:\\log_result.txt'
# Start every run with an empty log; each page is appended to it below.
open(filepath, 'w').close()
lastpage = 50
# Copy the "next page" link of a Taobao item's transaction-record list,
# delete the trailing page number, and put the rest in the url variable.
url = 'http://tbskip.taobao.com/json/show_buyer_list.htm?is_offline=&page_size=15&is_start=false&item_type=b&ends=1305198406000&starts=1304593606000&item_id=5964804060&user_tag=475363344&old_quantity=5521&sold_total_num=3057&closed=false&seller_num_id=69211806&zhichong=true&bidPage='
# (price, product) pairs already recorded; a set gives O(1) duplicate checks.
seen_price_product = set()
# Every unique record, as (price, date, product) tuples.
allfieldlist = []
for i in range(1, lastpage + 1):
    fullurl = '%s%d' % (url, i)
    print('第%d页 - %s' % (i, fullurl))

    try:
        records = _fetch_page(fullurl)
    except Exception as exc:
        # Best effort: an unexpected failure on one page must not abort the
        # whole run. Unlike a bare "except:", Ctrl-C still stops the script.
        print('fetch fail! %r' % (exc,))
        continue
    if records is None:
        # Every attempt hit an IOError (already reported); skip this page.
        continue

    new_lines = []
    for record in records:
        price_product = (record[0], record[2])
        if price_product in seen_price_product:
            continue
        seen_price_product.add(price_product)
        allfieldlist.append(record)
        new_lines.append(_format_record(record))

    if new_lines:
        # The log is written outside the fetch/retry logic so that a file
        # error is reported as such and never triggers a pointless re-fetch.
        try:
            with open(filepath, 'a') as logfile:
                logfile.write(''.join(new_lines))
        except IOError:
            print('write file fail!')

# Save the result sorted by product, then by date.
allfieldlist.sort(key=itemgetter(2, 1))
with open(resultfilepath, 'w') as resultfile:
    resultfile.writelines(_format_record(record) for record in allfieldlist)
print('Success!')
Python遍历淘宝成交记录（版本二）

相关文章：