爬取淘宝搜索结果

通过python爬虫可以轻松爬取淘宝搜索关键字返回的内容

效果展示:

jieguo

源码:

淘宝网搜索功能受限:

由于淘宝设置了要登陆了才能使用搜索功能

so, 下面这份简单的爬虫代码并不会起作用:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#CrowTaobaoPrice.py
import requests
import re

def getHTMLText(url):
try:
r = requests.get(url, timeout=30)
r.raise_for_status()
r.encoding = r.apparent_encoding
return r.text
except:
return ""

def parsePage(ilt, html):
try:
plt = re.findall(r'\"view_price\"\:\"[\d\.]*\"',html)
tlt = re.findall(r'\"raw_title\"\:\".*?\"',html)
for i in range(len(plt)):
price = eval(plt[i].split(':')[1])
title = eval(tlt[i].split(':')[1])
ilt.append([price , title])
except:
print("")

def printGoodsList(ilt):
tplt = "{:4}\t{:8}\t{:16}"
print(tplt.format("序号", "价格", "商品名称"))
count = 0
for g in ilt:
count = count + 1
print(tplt.format(count, g[0], g[1]))

def main():
goods = '书包'
depth = 3
start_url = 'https://s.taobao.com/search?q=' + goods
infoList = []
for i in range(depth):
try:
url = start_url + '&s=' + str(44*i)
html = getHTMLText(url)
parsePage(infoList, html)
except:
continue
printGoodsList(infoList)

main()

解决办法:

首先我们需要先在浏览器中登录我们的个人淘宝,然后搜索以书包为例的商品,打开开发者模式(我使用的是chrome)或者按F12

这里我们可以看到我们当前的cookie和user-agent(一般是Mozilla/5.0)(注意:如果没有出现这几个name,点击浏览器刷新就会出现了)

通过增加cookie和user-agent,发现代码正常运行

下面代码中getcookiefromchrome函数为获取cookie

然后在gethtmltext函数中增加cookie和user-agent访问身份

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import os
import re
import sqlite3
import requests
from win32.win32crypt import CryptUnprotectData

def getcookiefromchrome():
host = '.taobao.com'
cookies_str = ''
cookiepath=os.environ['LOCALAPPDATA']+r"\Google\Chrome\User Data\Default\Cookies"
sql="select host_key,name,encrypted_value from cookies where host_key='%s'" % host
with sqlite3.connect(cookiepath) as conn:
cu=conn.cursor()
cookies={name:CryptUnprotectData(encrypted_value)[1].decode() for host_key,name,encrypted_value in cu.execute(sql).fetchall()}
for key,values in cookies.items():
cookies_str = cookies_str + str(key)+"="+str(values)+';'
return cookies_str

def getHTMLText(url):
cookies = getcookiefromchrome()
kv = {'cookie':cookies,'user-agent':'Mozilla/5.0'}
try:
r = requests.get(url, headers=kv, timeout=30)
r.raise_for_status()
r.encoding = r.apparent_encoding
return r.text
except:
return ""

def parsePage(ilt, html):
try:
plt = re.findall(r'\"view_price\"\:\"[\d\.]*\"', html)
tlt = re.findall(r'\"raw_title\"\:\".*?\"',html)
for i in range(len(plt)):
price = eval(plt[i].split(':')[1])
title = eval(tlt[i].split(':')[1])
ilt.append([price, title])
except:
print("")

def printGoodsList(ilt):
tplt = "{:4}\t{:8}\t{:16}"
print(tplt.format("序号","价格","商品名称"))
count = 0
for g in ilt:
count = count + 1
print(tplt.format(count, g[0], g[1]))

def main():
goods = input('商品:')
depth = int(input('页数:'))
start_url = 'https://s.taobao.com/search?q=' + goods
infoList = []
print ("正在爬取···")
for i in range(depth):
try:
url = start_url + '&s=' + str(44*i)
html = getHTMLText(url)
parsePage(infoList, html)
except:
continue
printGoodsList(infoList)

main()

实测只有先前已经登陆过淘宝页面才能正常运行以上代码(清楚浏览器数据后运行没成功)

问题后续有时间再研究。。。

补充一些效果图吧:

1

2


学完模拟登陆之后,可以来一份简单的爬取代码辣!!!(基础知识去爬虫专题的模拟登陆看)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# -*- coding: utf-8 -*-
import re
import requests
import time
from selenium import webdriver

wd = webdriver.Chrome()
loginUrl = 'https://login.taobao.com/member/login.jhtml'
wd.get(loginUrl) #进入登陆界面

time.sleep(30)#设定30秒睡眠,期间进行手动登陆。十分关键,下面有解释。
cookies = wd.get_cookies()#调出Cookies
req = requests.Session()
for cookie in cookies:
req.cookies.set(cookie['name'],cookie['value'])
req.headers.clear()


def getHTMLText(url):
try:
r = req.get(url)
r.raise_for_status()
r.encoding = r.apparent_encoding
return r.text
except:
return ""

def parsePage(ilt, html):
try:
plt = re.findall(r'\"view_price\"\:\"[\d\.]*\"', html)
tlt = re.findall(r'\"raw_title\"\:\".*?\"',html)
for i in range(len(plt)):
price = eval(plt[i].split(':')[1])
title = eval(tlt[i].split(':')[1])
ilt.append([price, title])
except:
print("")

def printGoodsList(ilt):
tplt = "{:4}\t{:8}\t{:16}"
print(tplt.format("序号","价格","商品名称"))
count = 0
for g in ilt:
count = count + 1
print(tplt.format(count, g[0], g[1]))


goods = input('商品:')
depth = int(input('页数:'))
start_url = 'https://s.taobao.com/search?q=' + goods
infoList = []
print ("正在爬取···")
for i in range(depth):
try:
url = start_url + '&s=' + str(44*i)
html = getHTMLText(url)
parsePage(infoList, html)
except:
continue
printGoodsList(infoList)
Donate? comment?