爬取淘宝搜索结果

通过python爬虫可以轻松爬取淘宝搜索关键字返回的内容

效果展示：

jieguo

源码：

淘宝网搜索功能受限：

由于淘宝要求必须先登录才能使用搜索功能

so，下面这份简单的爬虫代码并不会起作用：

#CrowTaobaoPrice.py
import requests
import re
 
def getHTMLText(url):
    """Download ``url`` and return the response body as text.

    The response encoding is replaced by the one ``requests`` detects from
    the body (``apparent_encoding``) before the text is decoded.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or an empty string when the request fails or
        the server answers with an HTTP error status.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort fetch: a failed page yields "" instead of aborting the
        # crawl. Unlike the former bare ``except:``, KeyboardInterrupt and
        # SystemExit are not swallowed.
        return ""
     
def parsePage(ilt, html):
    """Collect ``[price, title]`` pairs found in a search result page.

    Every ``"view_price":"..."`` value in ``html`` is paired, in order of
    appearance, with the ``"raw_title":"..."`` value at the same position.

    Args:
        ilt: List that receives the ``[price, title]`` entries (mutated in
            place). Both items are strings.
        html: Page source to scan; an empty string adds nothing.
    """
    import json  # local import: only needed to decode the captured titles

    def _decode(raw):
        # ``raw`` is the inside of a JSON string literal. Decode its escapes
        # with the JSON parser instead of eval(), which would execute
        # whatever text the remote page contains.
        try:
            return json.loads('"' + raw + '"')
        except ValueError:
            return raw

    # Capture only the value, and accept backslash-escaped characters, so a
    # title containing ':' or '\"' is neither truncated nor dropped.
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    titles = re.findall(r'"raw_title":"((?:[^"\\]|\\.)*)"', html)
    # zip() stops at the shorter list, so unequal counts cannot raise
    # IndexError halfway through the page.
    for price, title in zip(prices, titles):
        ilt.append([price, _decode(title)])
 
def printGoodsList(ilt):
    """Print the collected goods as a tab-separated, numbered table.

    Args:
        ilt: Iterable of ``[price, title]`` entries. A header row is printed
            first, then one row per entry, numbered from 1.
    """
    row_format = "{:4}\t{:8}\t{:16}"
    header = row_format.format("序号", "价格", "商品名称")
    print(header)
    for number, goods in enumerate(ilt, start=1):
        line = row_format.format(number, goods[0], goods[1])
        print(line)
         
def main(goods='书包', depth=3):
    """Crawl Taobao search results for ``goods`` and print them as a table.

    Args:
        goods: Search keyword. Defaults to the keyword that was previously
            hard-coded.
        depth: Number of result pages to request. Defaults to 3.
    """
    from urllib.parse import quote  # local import keeps the block self-contained

    # Percent-encode the keyword so characters such as '&', '#' or '+' cannot
    # change the structure of the query string.
    start_url = 'https://s.taobao.com/search?q=' + quote(goods)
    infoList = []
    for i in range(depth):
        try:
            # The 's' offset grows by 44 for each page requested.
            url = start_url + '&s=' + str(44 * i)
            html = getHTMLText(url)
            parsePage(infoList, html)
        except Exception:
            # Skip a page that fails, but let Ctrl-C / SystemExit stop the
            # crawl (a bare ``except:`` would swallow them too).
            continue
    printGoodsList(infoList)


if __name__ == "__main__":
    main()

解决办法：

首先我们需要先在浏览器中登录我们的个人淘宝，然后搜索以书包为例的商品，打开开发者模式（我使用的是chrome）或者按F12

这里我们可以看到我们当前的cookie和user-agent（一般是Mozilla/5.0）（注意：如果没有出现这几个name，点击浏览器刷新就会出现了）

通过增加cookie和user-agent，发现代码正常运行

下面代码中getcookiefromchrome函数为获取cookie

然后在getHTMLText函数中通过请求头增加cookie和user-agent，以携带访问身份

import os
import re
import sqlite3
import requests
from win32.win32crypt import CryptUnprotectData

def getcookiefromchrome():
    """Read the ``.taobao.com`` cookies from the local Chrome profile.

    Opens the ``Cookies`` SQLite database of Chrome's default profile under
    ``%LOCALAPPDATA%``, decrypts every stored value with
    ``CryptUnprotectData`` and joins the results into one string.

    Returns:
        A string of ``name=value;`` pairs (every pair, including the last,
        ends with ``;``). Empty when no row matches the host.

    Raises:
        KeyError: If the ``LOCALAPPDATA`` environment variable is not set.
    """
    host = '.taobao.com'
    cookiepath = os.environ['LOCALAPPDATA'] + r"\Google\Chrome\User Data\Default\Cookies"
    # Bind the host as a query parameter instead of formatting it into the SQL.
    sql = "select host_key,name,encrypted_value from cookies where host_key=?"
    # ``with sqlite3.connect(...)`` only manages the transaction and leaves
    # the connection open, so close it explicitly.
    conn = sqlite3.connect(cookiepath)
    try:
        rows = conn.execute(sql, (host,)).fetchall()
    finally:
        conn.close()
    # NOTE(review): presumably this only works for Chrome versions that keep
    # DPAPI-encrypted values at this path; confirm against the installed one.
    cookies = {
        name: CryptUnprotectData(encrypted_value)[1].decode()
        for _host_key, name, encrypted_value in rows
    }
    return ''.join(
        "{}={};".format(name, value) for name, value in cookies.items()
    )
    
def getHTMLText(url):
    """Download ``url`` with the Chrome cookies and return the body as text.

    The cookies returned by ``getcookiefromchrome()`` and a ``Mozilla/5.0``
    user agent are sent as request headers. Errors raised while reading the
    cookies are not handled here and propagate to the caller.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or an empty string when the request fails or
        the server answers with an HTTP error status.
    """
    cookies = getcookiefromchrome()
    kv = {'cookie': cookies, 'user-agent': 'Mozilla/5.0'}
    try:
        r = requests.get(url, headers=kv, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort fetch: a failed page yields "". Unlike the former bare
        # ``except:``, KeyboardInterrupt and SystemExit are not swallowed.
        return ""
    
def parsePage(ilt, html):
    """Collect ``[price, title]`` pairs found in a search result page.

    Every ``"view_price":"..."`` value in ``html`` is paired, in order of
    appearance, with the ``"raw_title":"..."`` value at the same position.

    Args:
        ilt: List that receives the ``[price, title]`` entries (mutated in
            place). Both items are strings.
        html: Page source to scan; an empty string adds nothing.
    """
    import json  # local import: only needed to decode the captured titles

    def _decode(raw):
        # ``raw`` is the inside of a JSON string literal. Decode its escapes
        # with the JSON parser instead of eval(), which would execute
        # whatever text the remote page contains.
        try:
            return json.loads('"' + raw + '"')
        except ValueError:
            return raw

    # Capture only the value, and accept backslash-escaped characters, so a
    # title containing ':' or '\"' is neither truncated nor dropped.
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    titles = re.findall(r'"raw_title":"((?:[^"\\]|\\.)*)"', html)
    # zip() stops at the shorter list, so unequal counts cannot raise
    # IndexError halfway through the page.
    for price, title in zip(prices, titles):
        ilt.append([price, _decode(title)])
    
def printGoodsList(ilt):
    """Write a numbered, tab-separated table of goods to standard output.

    Args:
        ilt: Iterable of ``[price, title]`` entries. The header row comes
            first; the entries follow, numbered from 1.
    """
    layout = "{:4}\t{:8}\t{:16}"

    def emit(*cells):
        # One table row per call, formatted with the shared column layout.
        print(layout.format(*cells))

    emit("序号", "价格", "商品名称")
    for position, entry in enumerate(ilt, 1):
        emit(position, entry[0], entry[1])
   
def main():
    """Prompt for a keyword and page count, crawl Taobao and print the goods.

    Raises:
        ValueError: If the page count entered is not an integer.
    """
    from urllib.parse import quote  # local import keeps the block self-contained

    goods = input('商品：')
    depth = int(input('页数：'))
    # The keyword is user input: percent-encode it so characters such as '&',
    # '#' or '+' cannot change the structure of the query string.
    start_url = 'https://s.taobao.com/search?q=' + quote(goods)
    infoList = []
    print ("正在爬取···")
    for i in range(depth):
        try:
            # The 's' offset grows by 44 for each page requested.
            url = start_url + '&s=' + str(44 * i)
            html = getHTMLText(url)
            parsePage(infoList, html)
        except Exception:
            # Skip a page that fails, but let Ctrl-C / SystemExit stop the
            # crawl (a bare ``except:`` would swallow them too).
            continue
    printGoodsList(infoList)


if __name__ == "__main__":
    main()

实测只有先前已经登录过淘宝页面才能正常运行以上代码（清除浏览器数据后运行没成功）

问题后续有时间再研究。。。

补充一些效果图吧：

学完模拟登录之后，可以来一份简单的爬取代码辣！！！（基础知识去爬虫专题的模拟登录看）

# -*- coding: utf-8 -*-
import re
import requests
import time
from selenium import webdriver

# Start a Selenium-driven Chrome instance and open Taobao's login page so the
# user can sign in by hand; the browser's cookies are then copied into a
# ``requests`` session that the rest of the script uses for its requests.
wd = webdriver.Chrome()
loginUrl = 'https://login.taobao.com/member/login.jhtml' 
wd.get(loginUrl) # open the login page
 
time.sleep(30)# pause 30 seconds for the manual login; the original author marks this step as essential
cookies = wd.get_cookies()# read the cookies held by the browser
req = requests.Session()
# Copy every browser cookie (name and value only) into the session.
for cookie in cookies:
    req.cookies.set(cookie['name'],cookie['value'])
# Remove all headers currently set on the session.
# NOTE(review): this presumably also drops the default User-Agent -- confirm
# that this is intended.
req.headers.clear() 


def getHTMLText(url):
    """Download ``url`` through the logged-in session and return its text.

    Uses the module-level ``req`` session, which carries the cookies copied
    from the browser.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or an empty string when the request fails,
        times out or the server answers with an HTTP error status.
    """
    try:
        # Without a timeout a stalled connection would block the crawl
        # forever; 30 seconds matches the other versions of this function.
        r = req.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort fetch: a failed page yields "". Unlike the former bare
        # ``except:``, KeyboardInterrupt and SystemExit are not swallowed.
        return ""
    
def parsePage(ilt, html):
    """Collect ``[price, title]`` pairs found in a search result page.

    Every ``"view_price":"..."`` value in ``html`` is paired, in order of
    appearance, with the ``"raw_title":"..."`` value at the same position.

    Args:
        ilt: List that receives the ``[price, title]`` entries (mutated in
            place). Both items are strings.
        html: Page source to scan; an empty string adds nothing.
    """
    import json  # local import: only needed to decode the captured titles

    def _decode(raw):
        # ``raw`` is the inside of a JSON string literal. Decode its escapes
        # with the JSON parser instead of eval(), which would execute
        # whatever text the remote page contains.
        try:
            return json.loads('"' + raw + '"')
        except ValueError:
            return raw

    # Capture only the value, and accept backslash-escaped characters, so a
    # title containing ':' or '\"' is neither truncated nor dropped.
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    titles = re.findall(r'"raw_title":"((?:[^"\\]|\\.)*)"', html)
    # zip() stops at the shorter list, so unequal counts cannot raise
    # IndexError halfway through the page.
    for price, title in zip(prices, titles):
        ilt.append([price, _decode(title)])
    
def printGoodsList(ilt):
    """Print a header row followed by one numbered row per goods entry.

    Args:
        ilt: Iterable of ``[price, title]`` entries; rows are numbered from 1
            and the columns are separated by tabs.
    """
    render = "{:4}\t{:8}\t{:16}".format
    print(render("序号", "价格", "商品名称"))
    for serial, item in enumerate(ilt, 1):
        price = item[0]
        title = item[1]
        print(render(serial, price, title))


# Interactive crawl: ask for a keyword and a page count, fetch that many
# result pages through getHTMLText(), collect the goods with parsePage() and
# print them with printGoodsList().
from urllib.parse import quote  # needed only by this script section

goods = input('商品：')
depth = int(input('页数：'))
# The keyword is user input: percent-encode it so characters such as '&', '#'
# or '+' cannot change the structure of the query string.
start_url = 'https://s.taobao.com/search?q=' + quote(goods)
infoList = []
print ("正在爬取···")
for i in range(depth):
    try:
        # The 's' offset grows by 44 for each page requested.
        url = start_url + '&s=' + str(44*i)
        html = getHTMLText(url)
        parsePage(infoList, html)
    except Exception:
        # Skip a page that fails, but let Ctrl-C / SystemExit stop the crawl
        # (a bare ``except:`` would swallow them too).
        continue
printGoodsList(infoList)