

awesome-website-2

Posted on 2019-05-14 | In python, web development, hands-on |
Words count in article: 2k | Reading time ≈ 11

Python Web Development (5) – Building a Web Framework

Building a Web Framework

Because aiohttp is a fairly low-level web framework, we build a small framework on top of it that makes URL handling more convenient.

Create coroweb.py in the www directory:

import asyncio, os, inspect, logging, functools

from urllib import parse

from aiohttp import web

## apis is the pagination module; its code is at the end of this post.
## Put apis.py under www to avoid import errors.
## APIError signals a logic error during an API call.
from apis import APIError

## Decorator factory @get()
def get(path):
    ## Define decorator @get('/path')
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kw):
            return func(*args, **kw)
        wrapper.__method__ = 'GET'
        wrapper.__route__ = path
        return wrapper
    return decorator

## Decorator factory @post()
def post(path):
    ## Define decorator @post('/path')
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kw):
            return func(*args, **kw)
        wrapper.__method__ = 'POST'
        wrapper.__route__ = path
        return wrapper
    return decorator

## Helper functions used by RequestHandler
def get_required_kw_args(fn):
    args = []
    params = inspect.signature(fn).parameters
    for name, param in params.items():
        if param.kind == inspect.Parameter.KEYWORD_ONLY and param.default == inspect.Parameter.empty:
            args.append(name)
    return tuple(args)

def get_named_kw_args(fn):
    args = []
    params = inspect.signature(fn).parameters
    for name, param in params.items():
        if param.kind == inspect.Parameter.KEYWORD_ONLY:
            args.append(name)
    return tuple(args)

def has_named_kw_args(fn):
    params = inspect.signature(fn).parameters
    for name, param in params.items():
        if param.kind == inspect.Parameter.KEYWORD_ONLY:
            return True

def has_var_kw_arg(fn):
    params = inspect.signature(fn).parameters
    for name, param in params.items():
        if param.kind == inspect.Parameter.VAR_KEYWORD:
            return True

def has_request_arg(fn):
    sig = inspect.signature(fn)
    params = sig.parameters
    found = False
    for name, param in params.items():
        if name == 'request':
            found = True
            continue
        if found and (param.kind != inspect.Parameter.VAR_POSITIONAL and param.kind != inspect.Parameter.KEYWORD_ONLY and param.kind != inspect.Parameter.VAR_KEYWORD):
            raise ValueError('request parameter must be the last named parameter in function: %s%s' % (fn.__name__, str(sig)))
    return found

## RequestHandler inspects a URL handler's signature to work out which
## arguments it expects.
class RequestHandler(object):

    def __init__(self, app, fn):
        self._app = app
        self._func = fn
        self._has_request_arg = has_request_arg(fn)
        self._has_var_kw_arg = has_var_kw_arg(fn)
        self._has_named_kw_args = has_named_kw_args(fn)
        self._named_kw_args = get_named_kw_args(fn)
        self._required_kw_args = get_required_kw_args(fn)

    async def __call__(self, request):
        kw = None
        if self._has_var_kw_arg or self._has_named_kw_args or self._required_kw_args:
            if request.method == 'POST':
                if not request.content_type:
                    return web.HTTPBadRequest(text='Missing Content-Type.')
                ct = request.content_type.lower()
                if ct.startswith('application/json'):
                    params = await request.json()
                    if not isinstance(params, dict):
                        return web.HTTPBadRequest(text='JSON body must be object.')
                    kw = params
                elif ct.startswith('application/x-www-form-urlencoded') or ct.startswith('multipart/form-data'):
                    params = await request.post()
                    kw = dict(**params)
                else:
                    return web.HTTPBadRequest(text='Unsupported Content-Type: %s' % request.content_type)
            if request.method == 'GET':
                qs = request.query_string
                if qs:
                    kw = dict()
                    for k, v in parse.parse_qs(qs, True).items():
                        kw[k] = v[0]
        if kw is None:
            kw = dict(**request.match_info)
        else:
            if not self._has_var_kw_arg and self._named_kw_args:
                # remove all unnamed kw:
                copy = dict()
                for name in self._named_kw_args:
                    if name in kw:
                        copy[name] = kw[name]
                kw = copy
            # check named arg:
            for k, v in request.match_info.items():
                if k in kw:
                    logging.warning('Duplicate arg name in named arg and kw args: %s' % k)
                kw[k] = v
        if self._has_request_arg:
            kw['request'] = request
        # check required kw:
        if self._required_kw_args:
            for name in self._required_kw_args:
                if name not in kw:
                    return web.HTTPBadRequest(text='Missing argument: %s' % name)
        logging.info('call with args: %s' % str(kw))
        try:
            r = await self._func(**kw)
            return r
        except APIError as e:
            return dict(error=e.error, data=e.data, message=e.message)

## add_static registers the files under the static/ directory.
def add_static(app):
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'static')
    app.router.add_static('/static/', path)
    logging.info('add static %s => %s' % ('/static/', path))

## add_route registers a single URL handler function.
def add_route(app, fn):
    method = getattr(fn, '__method__', None)
    path = getattr(fn, '__route__', None)
    if path is None or method is None:
        raise ValueError('@get or @post not defined in %s.' % str(fn))
    if not asyncio.iscoroutinefunction(fn) and not inspect.isgeneratorfunction(fn):
        fn = asyncio.coroutine(fn)
    logging.info('add route %s %s => %s(%s)' % (method, path, fn.__name__, ', '.join(inspect.signature(fn).parameters.keys())))
    app.router.add_route(method, path, RequestHandler(app, fn))

## add_routes scans a handler module and registers every qualifying URL function.
def add_routes(app, module_name):
    n = module_name.rfind('.')
    if n == (-1):
        mod = __import__(module_name, globals(), locals())
    else:
        name = module_name[n+1:]
        mod = getattr(__import__(module_name[:n], globals(), locals(), [name]), name)
    for attr in dir(mod):
        if attr.startswith('_'):
            continue
        fn = getattr(mod, attr)
        if callable(fn):
            method = getattr(fn, '__method__', None)
            path = getattr(fn, '__route__', None)
            if method and path:
                add_route(app, fn)
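
With these pieces in place, a URL handler is just a decorated coroutine that add_routes() can discover through its __method__ and __route__ attributes. As a sketch (this handler is illustrative, not part of the tutorial's handlers.py):

from coroweb import get

## Hypothetical handler: `id` is filled from request.match_info, and
## `request` is passed in because the signature ends with a parameter
## named `request`.
@get('/blog/{id}')
async def get_blog(id, request):
    return '<h1>blog %s</h1>' % id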

Finally, add support for middleware, jinja2 templates, and handler auto-registration to app.py. The revised app.py:

import logging; logging.basicConfig(level=logging.INFO)
import asyncio, os, json, time
from datetime import datetime
from aiohttp import web
from jinja2 import Environment, FileSystemLoader

## The config module is created later. To avoid import errors, download
## config.py and config_default.py from
## 'https://github.com/yzyly1992/2019_Python_Web_Dev' (or copy them from the
## next chapter) into www first.
from config import configs

import orm
from coroweb import add_routes, add_static

## handlers is the URL-handler module. Uncomment the next line once
## handlers.py has been fully written in the API chapter.
## from handlers import cookie2user, COOKIE_NAME

## Initialize jinja2.
def init_jinja2(app, **kw):
    logging.info('init jinja2...')
    options = dict(
        autoescape = kw.get('autoescape', True),
        block_start_string = kw.get('block_start_string', '{%'),
        block_end_string = kw.get('block_end_string', '%}'),
        variable_start_string = kw.get('variable_start_string', '{{'),
        variable_end_string = kw.get('variable_end_string', '}}'),
        auto_reload = kw.get('auto_reload', True)
    )
    path = kw.get('path', None)
    if path is None:
        path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'templates')
    logging.info('set jinja2 template path: %s' % path)
    env = Environment(loader=FileSystemLoader(path), **options)
    filters = kw.get('filters', None)
    if filters is not None:
        for name, f in filters.items():
            env.filters[name] = f
    app['__templating__'] = env

## The middlewares below pull shared behaviour out of the individual URL
## handlers and keep it in one place.
## Request-logging middleware factory.
async def logger_factory(app, handler):
    async def logger(request):
        logging.info('Request: %s %s' % (request.method, request.path))
        return (await handler(request))
    return logger

## Auth middleware factory: binds the current user to the request and blocks
## /manage/ URLs unless the current user is an admin.
## It depends on handlers.py; uncomment once handlers.py is fully written in
## the API chapter.
##async def auth_factory(app, handler):
##    async def auth(request):
##        logging.info('check user: %s %s' % (request.method, request.path))
##        request.__user__ = None
##        cookie_str = request.cookies.get(COOKIE_NAME)
##        if cookie_str:
##            user = await cookie2user(cookie_str)
##            if user:
##                logging.info('set current user: %s' % user.email)
##                request.__user__ = user
##        if request.path.startswith('/manage/') and (request.__user__ is None or not request.__user__.admin):
##            return web.HTTPFound('/signin')
##        return (await handler(request))
##    return auth

## Data-parsing middleware factory.
async def data_factory(app, handler):
    async def parse_data(request):
        if request.method == 'POST':
            if request.content_type.startswith('application/json'):
                request.__data__ = await request.json()
                logging.info('request json: %s' % str(request.__data__))
            elif request.content_type.startswith('application/x-www-form-urlencoded'):
                request.__data__ = await request.post()
                logging.info('request form: %s' % str(request.__data__))
        return (await handler(request))
    return parse_data

## Response-conversion middleware factory.
async def response_factory(app, handler):
    async def response(request):
        logging.info('Response handler...')
        r = await handler(request)
        if isinstance(r, web.StreamResponse):
            return r
        if isinstance(r, bytes):
            resp = web.Response(body=r)
            resp.content_type = 'application/octet-stream'
            return resp
        if isinstance(r, str):
            if r.startswith('redirect:'):
                return web.HTTPFound(r[9:])
            resp = web.Response(body=r.encode('utf-8'))
            resp.content_type = 'text/html;charset=utf-8'
            return resp
        if isinstance(r, dict):
            template = r.get('__template__')
            if template is None:
                resp = web.Response(body=json.dumps(r, ensure_ascii=False, default=lambda o: o.__dict__).encode('utf-8'))
                resp.content_type = 'application/json;charset=utf-8'
                return resp
            else:
                ## Uncomment the next line once handlers.py is complete.
                ##r['__user__'] = request.__user__
                resp = web.Response(body=app['__templating__'].get_template(template).render(**r).encode('utf-8'))
                resp.content_type = 'text/html;charset=utf-8'
                return resp
        if isinstance(r, int) and r >= 100 and r < 600:
            return web.Response(status=r)
        if isinstance(r, tuple) and len(r) == 2:
            t, m = r
            if isinstance(t, int) and t >= 100 and t < 600:
                return web.Response(status=t, reason=str(m))
        # default:
        resp = web.Response(body=str(r).encode('utf-8'))
        resp.content_type = 'text/plain;charset=utf-8'
        return resp
    return response

## Human-readable time filter.
def datetime_filter(t):
    delta = int(time.time() - t)
    if delta < 60:
        return u'1 minute ago'
    if delta < 3600:
        return u'%s minutes ago' % (delta // 60)
    if delta < 86400:
        return u'%s hours ago' % (delta // 3600)
    if delta < 604800:
        return u'%s days ago' % (delta // 86400)
    dt = datetime.fromtimestamp(t)
    return u'%s-%s-%s' % (dt.year, dt.month, dt.day)

async def init(loop):
    await orm.create_pool(loop=loop, **configs.db)
    ## Add auth_factory to the middlewares list below once handlers.py is complete.
    app = web.Application(loop=loop, middlewares=[
        logger_factory, response_factory
    ])
    init_jinja2(app, filters=dict(datetime=datetime_filter))
    add_routes(app, 'handlers')
    add_static(app)
    srv = await loop.create_server(app.make_handler(), '127.0.0.1', 9000)
    logging.info('server started at http://127.0.0.1:9000...')
    return srv

loop = asyncio.get_event_loop()
loop.run_until_complete(init(loop))
loop.run_forever()
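
Thanks to response_factory, handlers never construct Response objects themselves: they may return a dict (serialized to JSON, or rendered through a template when '__template__' is present), a str (HTML, or a 302 when prefixed with 'redirect:'), bytes, an int status code, or a (status, reason) tuple. A small illustrative sketch (routes and handler names are made up, not from the tutorial):

from coroweb import get

@get('/api/ping')
async def api_ping(request):
    return dict(pong=True)          # plain dict -> serialized to JSON

@get('/')
async def index(request):
    return {'__template__': 'blogs.html', 'blogs': []}   # rendered by jinja2

@get('/old-home')
async def old_home(request):
    return 'redirect:/'             # becomes web.HTTPFound('/')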

Below is apis.py, which handles pagination and API errors; place it under www:

import json, logging, inspect, functools

## Page handles pagination; change page_size to adjust items per page.
class Page(object):

    def __init__(self, item_count, page_index=1, page_size=8):
        self.item_count = item_count
        self.page_size = page_size
        self.page_count = item_count // page_size + (1 if item_count % page_size > 0 else 0)
        if (item_count == 0) or (page_index > self.page_count):
            self.offset = 0
            self.limit = 0
            self.page_index = 1
        else:
            self.page_index = page_index
            self.offset = self.page_size * (page_index - 1)
            self.limit = self.page_size
        self.has_next = self.page_index < self.page_count
        self.has_previous = self.page_index > 1

    def __str__(self):
        return 'item_count: %s, page_count: %s, page_index: %s, page_size: %s, offset: %s, limit: %s' % (self.item_count, self.page_count, self.page_index, self.page_size, self.offset, self.limit)

    __repr__ = __str__

## The API error classes follow.
class APIError(Exception):
    def __init__(self, error, data='', message=''):
        super(APIError, self).__init__(message)
        self.error = error
        self.data = data
        self.message = message

class APIValueError(APIError):
    def __init__(self, field, message=''):
        super(APIValueError, self).__init__('value:invalid', field, message)

class APIResourceNotFoundError(APIError):
    def __init__(self, field, message=''):
        super(APIResourceNotFoundError, self).__init__('value:notfound', field, message)

class APIPermissionError(APIError):
    def __init__(self, message=''):
        super(APIPermissionError, self).__init__('permission:forbidden', 'permission', message)

if __name__=='__main__':
    import doctest
    doctest.testmod()
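
A quick sanity check of Page's arithmetic, with the default page_size of 8 (values worked out by hand):

from apis import Page

p = Page(100, 3)                    # 100 items, page 3, 8 per page
print(p.page_count)                 # 13 (12 full pages plus one partial)
print(p.offset, p.limit)            # 16 8: skip two full pages, take 8 rows
print(p.has_next, p.has_previous)   # True True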

With the web framework in place, the next step is to add the URLs to be handled to the handlers module.

awesome-website-1

Posted on 2019-05-14 | In python, web development, hands-on |
Words count in article: 2.8k | Reading time ≈ 13

Python Web Development (1) – Setting Up the Development Environment

Install the third-party libraries the site needs with pip:

  • aiohttp, the asynchronous web framework
  • jinja2, the front-end template engine
  • aiomysql, the asynchronous MySQL driver for Python (install the latest MySQL first; the free MySQL Community Server is fine)
  • Markdown, a lightweight markup language that converts text to valid HTML
    $ pip3 install aiohttp jinja2 aiomysql markdown

Building the Project Structure

Pick a working directory and create the following layout:

awesome-website/         <-- project root
|
+- backup/               <-- backups
|
+- conf/                 <-- configuration files
|
+- dist/                 <-- packaged output
|
+- www/                  <-- web directory, holding the .py files
|  |
|  +- static/            <-- static files
|  |
|  +- templates/         <-- template files
|
+- LICENSE               <-- code license

After creating the project layout, it is a good idea to initialize a git repository right away and sync it to GitHub, so the code and its change history stay safe.

Python Web Development (2) – Writing the Site Skeleton

Writing the Site Skeleton

For an efficient site, all I/O should be built on asyncio (asynchronous I/O). With aiohttp we can write a basic server application, app.py, stored in the www directory:

import logging; logging.basicConfig(level=logging.INFO)
import asyncio
from aiohttp import web

## Respond to requests with "Awesome Website".
async def index(request):
    return web.Response(body=b'<h1>Awesome Website</h1>', content_type='text/html')

## Build the server app: keep listening on local port 9000 and serve the home page "/".
def init():
    app = web.Application()
    app.router.add_get('/', index)
    web.run_app(app, host='127.0.0.1', port=9000)

if __name__ == "__main__":
    init()

Run app.py from the www directory; the server will keep listening for HTTP requests on port 9000 and respond asynchronously to the home page /:

$ python3 app.py
======== Running on http://127.0.0.1:9000 ========
(Press CTRL+C to quit)

Open http://127.0.0.1:9000 in a browser to test it; if it returns the Awesome Website string we configured, the skeleton of the server application is in place.

Python Web Development (3) – Writing the ORM

Writing the ORM

Object Relational Mapping (ORM) is a technique for bridging the mismatch between object-oriented code and relational databases. In other words, ORM uses metadata describing the mapping between objects and the database to automatically persist program objects into relational tables.

In a website, all data (users, posts, comments, and so on) lives in the database; awesome-website uses MySQL. Accessing the database means creating connections and cursor objects, executing SQL statements, handling exceptions, and cleaning up resources. Scattering that code across every function would be hard to maintain, inefficient, and impossible to reuse, so we wrap the common MySQL operations in functions for the site to call.

Since the site is built on asynchronous programming, every layer of the system must be asynchronous as well. aiomysql provides an async I/O driver for MySQL.

Creating the Connection Pool

We need a global connection pool so that every HTTP request can obtain a database connection directly from it. The pool avoids constantly opening and closing connections by reusing them whenever possible.

The pool is stored in the global variable __pool; by default the encoding is utf8 and transactions auto-commit. Create orm.py in the www directory and add the following:

import asyncio, logging, aiomysql

def log(sql, args=()):
    logging.info('SQL: %s' % sql)

async def create_pool(loop, **kw):
    logging.info('create database connection pool...')
    global __pool
    __pool = await aiomysql.create_pool(
        host=kw.get('host', 'localhost'),
        port=kw.get('port', 3306),
        user=kw['user'],
        password=kw['password'],
        db=kw['db'],
        charset=kw.get('charset', 'utf8'),
        autocommit=kw.get('autocommit', True),
        maxsize=kw.get('maxsize', 10),
        minsize=kw.get('minsize', 1),
        loop=loop
    )

Select

SELECT statements are executed through a select function that takes the SQL string and its parameters. Append the following to orm.py:

async def select(sql, args, size=None):
    log(sql, args)
    global __pool
    with (await __pool) as conn:
        cur = await conn.cursor(aiomysql.DictCursor)
        await cur.execute(sql.replace('?', '%s'), args or ())
        if size:
            rs = await cur.fetchmany(size)
        else:
            rs = await cur.fetchall()
        await cur.close()
        logging.info('rows returned: %s' % len(rs))
        return rs

Our SQL statements use ? as the placeholder, while MySQL uses %s; select() converts between them internally. Always use parameterized SQL rather than concatenating SQL strings yourself, which guards against SQL injection attacks.
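
For example, inside a coroutine (after create_pool has run; the table and value are illustrative):

## The '?' placeholders are rewritten to '%s' and the driver escapes the
## arguments, so no manual string concatenation is needed.
rs = await select('select * from users where email=?', ['test@example.com'])
## rs is a list of dicts thanks to aiomysql.DictCursor,
## e.g. [{'id': '...', 'email': 'test@example.com', ...}]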

Insert, Update, Delete

INSERT, UPDATE, and DELETE statements can share a single generic execute() function, since all three take the same parameters and return an integer count of affected rows. Append the following to orm.py:

async def execute(sql, args):
    log(sql)
    with (await __pool) as conn:
        try:
            cur = await conn.cursor()
            await cur.execute(sql.replace('?', '%s'), args)
            affected = cur.rowcount
            await cur.close()
        except BaseException:
            raise
        return affected

Unlike select(), execute()'s cursor does not return a result set; it reports the number of affected rows via rowcount.
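
For example, again inside a coroutine (the pets table and its values are hypothetical, purely for illustration):

## execute() returns the affected row count instead of a result set.
affected = await execute(
    'insert into pets (`id`, `name`) values (?, ?)',
    ['0001', 'Rex']
)
assert affected == 1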

ORM

The first thing to define is Model, the base class that all ORM mappings inherit from. Append the following to orm.py:

class Model(dict, metaclass=ModelMetaclass):

    def __init__(self, **kw):
        super(Model, self).__init__(**kw)

    def __getattr__(self, key):
        try:
            return self[key]
        except KeyError:
            raise AttributeError(r"'Model' object has no attribute '%s'" % key)

    def __setattr__(self, key, value):
        self[key] = value

    def getValue(self, key):
        return getattr(self, key, None)

    def getValueOrDefault(self, key):
        value = getattr(self, key, None)
        if value is None:
            field = self.__mappings__[key]
            if field.default is not None:
                value = field.default() if callable(field.default) else field.default
                logging.debug('using default value for %s: %s' % (key, str(value)))
                setattr(self, key, value)
        return value
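
Because Model subclasses dict but routes attribute access through __getattr__/__setattr__, keys and attributes are interchangeable. A small sketch (it works once the metaclass shown further below is in place):

m = Model(id=1, name='Bob')
print(m['id'], m.name)        # 1 Bob: dict access and attribute access agree
m.email = 'bob@example.com'   # __setattr__ stores it as m['email']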

Then come Field and its various subclasses. Append the following to orm.py:

class Field(object):

    def __init__(self, name, column_type, primary_key, default):
        self.name = name
        self.column_type = column_type
        self.primary_key = primary_key
        self.default = default

    def __str__(self):
        return '<%s, %s:%s>' % (self.__class__.__name__, self.column_type, self.name)

class StringField(Field):

    def __init__(self, name=None, primary_key=False, default=None, ddl='varchar(100)'):
        super().__init__(name, ddl, primary_key, default)

class BooleanField(Field):

    def __init__(self, name=None, default=False):
        super().__init__(name, 'boolean', False, default)

class IntegerField(Field):

    def __init__(self, name=None, primary_key=False, default=0):
        super().__init__(name, 'bigint', primary_key, default)

class FloatField(Field):

    def __init__(self, name=None, primary_key=False, default=0.0):
        super().__init__(name, 'real', primary_key, default)

class TextField(Field):

    def __init__(self, name=None, default=None):
        super().__init__(name, 'text', False, default)

Note that Model is only a base class; reading the mapping information out of a concrete subclass such as User is done through the metaclass ModelMetaclass. Any class inheriting from Model (User, for example) is automatically scanned by ModelMetaclass, and its mapping is stored in class attributes such as __table__ and __mappings__. Add the following code before the Model class:

def create_args_string(num):
    L = []
    for n in range(num):
        L.append('?')
    return ', '.join(L)

class ModelMetaclass(type):

    def __new__(cls, name, bases, attrs):
        # skip the Model base class itself:
        if name=='Model':
            return type.__new__(cls, name, bases, attrs)
        # get the table name:
        tableName = attrs.get('__table__', None) or name
        logging.info('found model: %s (table: %s)' % (name, tableName))
        # collect all Fields and the primary key name:
        mappings = dict()
        fields = []
        primaryKey = None
        for k, v in attrs.items():
            if isinstance(v, Field):
                logging.info('  found mapping: %s ==> %s' % (k, v))
                mappings[k] = v
                if v.primary_key:
                    # found the primary key:
                    if primaryKey:
                        raise RuntimeError('Duplicate primary key for field: %s' % k)
                    primaryKey = k
                else:
                    fields.append(k)
        if not primaryKey:
            raise RuntimeError('Primary key not found.')
        for k in mappings.keys():
            attrs.pop(k)
        escaped_fields = list(map(lambda f: '`%s`' % f, fields))
        attrs['__mappings__'] = mappings # attribute-to-column mapping
        attrs['__table__'] = tableName
        attrs['__primary_key__'] = primaryKey # name of the primary key attribute
        attrs['__fields__'] = fields # attribute names other than the primary key
        # build the default SELECT, INSERT, UPDATE and DELETE statements:
        attrs['__select__'] = 'select `%s`, %s from `%s`' % (primaryKey, ', '.join(escaped_fields), tableName)
        attrs['__insert__'] = 'insert into `%s` (%s, `%s`) values (%s)' % (tableName, ', '.join(escaped_fields), primaryKey, create_args_string(len(escaped_fields) + 1))
        attrs['__update__'] = 'update `%s` set %s where `%s`=?' % (tableName, ', '.join(map(lambda f: '`%s`=?' % (mappings.get(f).name or f), fields)), primaryKey)
        attrs['__delete__'] = 'delete from `%s` where `%s`=?' % (tableName, primaryKey)
        return type.__new__(cls, name, bases, attrs)
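
To see what the metaclass generates, consider a hypothetical minimal model (not one of the site's actual models):

class Pet(Model):
    __table__ = 'pets'
    id = StringField(primary_key=True, ddl='varchar(50)')
    name = StringField(ddl='varchar(50)')

print(Pet.__select__)   # select `id`, `name` from `pets`
print(Pet.__insert__)   # insert into `pets` (`name`, `id`) values (?, ?)
print(Pet.__update__)   # update `pets` set `name`=? where `id`=?
print(Pet.__delete__)   # delete from `pets` where `id`=?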

Then we add class methods to Model, which every subclass can call:

class Model(dict):

    ...

    @classmethod
    async def findAll(cls, where=None, args=None, **kw):
        ## find objects by where clause
        sql = [cls.__select__]
        if where:
            sql.append('where')
            sql.append(where)
        if args is None:
            args = []
        orderBy = kw.get('orderBy', None)
        if orderBy:
            sql.append('order by')
            sql.append(orderBy)
        limit = kw.get('limit', None)
        if limit is not None:
            sql.append('limit')
            if isinstance(limit, int):
                sql.append('?')
                args.append(limit)
            elif isinstance(limit, tuple) and len(limit) == 2:
                sql.append('?, ?')
                args.extend(limit)
            else:
                raise ValueError('Invalid limit value: %s' % str(limit))
        rs = await select(' '.join(sql), args)
        return [cls(**r) for r in rs]

    @classmethod
    async def findNumber(cls, selectField, where=None, args=None):
        ## find a number by select and where
        sql = ['select %s _num_ from `%s`' % (selectField, cls.__table__)]
        if where:
            sql.append('where')
            sql.append(where)
        rs = await select(' '.join(sql), args, 1)
        if len(rs) == 0:
            return None
        return rs[0]['_num_']

    @classmethod
    async def find(cls, pk):
        ## find object by primary key
        rs = await select('%s where `%s`=?' % (cls.__select__, cls.__primary_key__), [pk], 1)
        if len(rs) == 0:
            return None
        return cls(**rs[0])
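
These class methods give every model a small query API. Inside a coroutine, using the User model defined in the next section (values are illustrative):

users = await User.findAll(orderBy='created_at desc', limit=5)   # newest 5
matched = await User.findAll('email=?', ['test@example.com'])    # bound args
n = await User.findNumber('count(id)')                           # row count
u = await User.find(uid)   # uid: some primary-key string obtained earlier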

Adding instance methods to Model likewise makes them callable on every subclass:

class Model(dict):

    ...

    async def save(self):
        args = list(map(self.getValueOrDefault, self.__fields__))
        args.append(self.getValueOrDefault(self.__primary_key__))
        rows = await execute(self.__insert__, args)
        if rows != 1:
            logging.warning('failed to insert record: affected rows: %s' % rows)

    async def update(self):
        args = list(map(self.getValue, self.__fields__))
        args.append(self.getValue(self.__primary_key__))
        rows = await execute(self.__update__, args)
        if rows != 1:
            logging.warning('failed to update by primary key: affected rows: %s' % rows)

    async def remove(self):
        args = [self.getValue(self.__primary_key__)]
        rows = await execute(self.__delete__, args)
        if rows != 1:
            logging.warning('failed to remove by primary key: affected rows: %s' % rows)
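
With those in place, persisting a row is one call per operation (a sketch, inside a coroutine; User comes from the next section and the values are illustrative):

u = User(name='Test', email='test@example.com', passwd='pw', image='about:blank')
await u.save()      # insert; id and created_at are filled from field defaults
u.name = 'Renamed'
await u.update()    # update `users` set ... where `id`=?
await u.remove()    # delete from `users` where `id`=?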

Python Web Development (4) – Writing the Models

Writing the Models

With orm.py finished, the three tables the site needs (user, blog, comment) can be expressed as Models. Create models.py in the www directory:

import time, uuid

from orm import Model, StringField, BooleanField, FloatField, TextField

def next_id():
    return '%015d%s000' % (int(time.time() * 1000), uuid.uuid4().hex)

class User(Model):
    __table__ = 'users'

    id = StringField(primary_key=True, default=next_id, ddl='varchar(50)')
    email = StringField(ddl='varchar(50)')
    passwd = StringField(ddl='varchar(50)')
    admin = BooleanField()
    name = StringField(ddl='varchar(50)')
    image = StringField(ddl='varchar(500)')
    created_at = FloatField(default=time.time)

class Blog(Model):
    __table__ = 'blogs'

    id = StringField(primary_key=True, default=next_id, ddl='varchar(50)')
    user_id = StringField(ddl='varchar(50)')
    user_name = StringField(ddl='varchar(50)')
    user_image = StringField(ddl='varchar(500)')
    name = StringField(ddl='varchar(50)')
    summary = StringField(ddl='varchar(200)')
    content = TextField()
    created_at = FloatField(default=time.time)

class Comment(Model):
    __table__ = 'comments'

    id = StringField(primary_key=True, default=next_id, ddl='varchar(50)')
    blog_id = StringField(ddl='varchar(50)')
    user_id = StringField(ddl='varchar(50)')
    user_name = StringField(ddl='varchar(50)')
    user_image = StringField(ddl='varchar(500)')
    content = TextField()
    created_at = FloatField(default=time.time)

Initializing the Database Tables

Since the site has only a few tables, we can hand-write the SQL script schema.sql in the root directory:

-- schema.sql

drop database if exists awesome;

create database awesome;

use awesome;

create user 'www-data'@'localhost' identified by 'www-data';
alter user 'www-data'@'localhost' identified with mysql_native_password by 'www-data';
grant select, insert, update, delete on awesome.* to 'www-data'@'localhost';

create table users (
    `id` varchar(50) not null,
    `email` varchar(50) not null,
    `passwd` varchar(50) not null,
    `admin` bool not null,
    `name` varchar(50) not null,
    `image` varchar(500) not null,
    `created_at` real not null,
    unique key `idx_email` (`email`),
    key `idx_created_at` (`created_at`),
    primary key (`id`)
) engine=innodb default charset=utf8;

create table blogs (
    `id` varchar(50) not null,
    `user_id` varchar(50) not null,
    `user_name` varchar(50) not null,
    `user_image` varchar(500) not null,
    `name` varchar(50) not null,
    `summary` varchar(200) not null,
    `content` mediumtext not null,
    `created_at` real not null,
    key `idx_created_at` (`created_at`),
    primary key (`id`)
) engine=innodb default charset=utf8;

create table comments (
    `id` varchar(50) not null,
    `blog_id` varchar(50) not null,
    `user_id` varchar(50) not null,
    `user_name` varchar(50) not null,
    `user_image` varchar(500) not null,
    `content` mediumtext not null,
    `created_at` real not null,
    key `idx_created_at` (`created_at`),
    primary key (`id`)
) engine=innodb default charset=utf8;

Feed schema.sql to the MySQL command line to initialize the tables:

$ mysql -u root -p < schema.sql

Then we can write a data-access script, test.py, to try things out, for example by creating a User object:

import orm
import asyncio
from models import User, Blog, Comment

async def test(loop):
    await orm.create_pool(loop=loop, user='root', password='root', db='awesome')
    u = User(name='Test', email='test@qq.com', passwd='1234567890', image='about:blank')
    await u.save()

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(test(loop))
    loop.close()

After running test.py, query from the MySQL client command line to check whether the test data was stored in MySQL correctly.
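
For instance, from the mysql client (database and columns as defined in schema.sql above):

$ mysql -u root -p awesome
mysql> select id, name, email, created_at from users;

The row inserted by test.py should appear in the output.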

Scraping 2016–2018 Admission Score Tables by Major from the SYSU Admissions Site

Posted on 2019-05-14 | In python, learning web scraping, crawler examples |
Words count in article: 1.5k | Reading time ≈ 9

The CSS selectors below have to be adjusted to each page's actual markup; also note that when scraping the titles, some pages use Chinese parentheses and others use ASCII ones.

# -*- coding: utf-8 -*-
from requests_html import HTMLSession
import csv

session = HTMLSession()

file = open('2018.csv', 'w', newline='')
csvwriter = csv.writer(file)
csvwriter.writerow(['科类', '院系', '专业', '最高分', '最低分', '平均分', '省份', '年份'])

links = ['http://admission.sysu.edu.cn/zs01/zs01c/beijing/1136901.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/zhejiang/1136947.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/hebei/1136940.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/tianjin/1136943.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/shanghai/1136938.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/chongqing/1136961.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/guangdong/1136663.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/shandong/1136936.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/jiangsu/1136929.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/sichuan/1136942.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/henan/1136922.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/shanxi/1136933.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/shanx/1136937.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/liaoning/1136931.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/qinghai/1136935.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/ningxia/1136934.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/neimenggu/1136932.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/jiangxi/1136930.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/jilin/1136926.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/hunan/1136925.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/hubei/1136924.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/heilongjiang/1136923.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/hainan/1136920.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/anhui/1136900.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/yunnan/1136946.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/guizhou/1136918.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/guangxi/1136910.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/xinjiang/1136945.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/gansu/1136903.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/fujian/1136902.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/xizang/1136944.htm',
        ]

for link in links:
    r = session.get(link)
    for i in range(80):
        shengfen = r.html.find('#cont > h1', first=True)
        kelei = r.html.find('#cont > table > tbody > tr:nth-child(' + str(i+3) + ') > td:nth-child(1) > span > span', first=True)
        yuanxi = r.html.find('#cont > table > tbody > tr:nth-child(' + str(i+3) + ') > td:nth-child(2) > span > span', first=True)
        zhuanye = r.html.find('#cont > table > tbody > tr:nth-child(' + str(i+3) + ') > td:nth-child(3) > span > span', first=True)
        high = r.html.find('#cont > table > tbody > tr:nth-child(' + str(i+3) + ') > td:nth-child(4) > span > span', first=True)
        low = r.html.find('#cont > table > tbody > tr:nth-child(' + str(i+3) + ') > td:nth-child(5) > span > span', first=True)
        avg = r.html.find('#cont > table > tbody > tr:nth-child(' + str(i+3) + ') > td:nth-child(6) > span > span', first=True)
        if kelei and yuanxi and zhuanye and high and low and avg:
            csvwriter.writerow([kelei.text, yuanxi.text, zhuanye.text, high.text, low.text, avg.text, shengfen.text.split('(')[1].split(')')[0], '2018'])
        elif kelei and yuanxi and zhuanye and high and low:
            # no average on this row: leave the 平均分 column blank so the columns stay aligned
            csvwriter.writerow([kelei.text, yuanxi.text, zhuanye.text, high.text, low.text, ' ', shengfen.text.split('(')[1].split(')')[0], '2018'])
        else:
            pass

file.close()
# -*- coding: utf-8 -*-
from requests_html import HTMLSession
import csv

session = HTMLSession()

file = open('2016.csv', 'a+', newline='')
csvwriter = csv.writer(file)
#csvwriter.writerow(['科类', '院系', '专业', '最高分', '最低分', '平均分', '省份', '年份'])

links = [#'http://admission.sysu.edu.cn/zs01/zs01c/henan/2018n.htm',
         #'http://admission.sysu.edu.cn/zs01/zs01c/fujian/2018n.htm',
         #'http://admission.sysu.edu.cn/zs01/zs01c/guangxi/2018n.htm',
         #'http://admission.sysu.edu.cn/zs01/zs01c/yunnan/2018n.htm',
         #'http://admission.sysu.edu.cn/zs01/zs01c/anhui/2018n.htm',
         #'http://admission.sysu.edu.cn/zs01/zs01c/hainan/2018n.htm',
         #'http://admission.sysu.edu.cn/zs01/zs01c/heilongjiang/2018n.htm',
         #'http://admission.sysu.edu.cn/zs01/zs01c/beijing/2018n.htm',
         #'http://admission.sysu.edu.cn/zs01/zs01c/zhejiang/2017n.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/shanghai/2017n.htm',
        ]

for link in links:
    r = session.get(link)
    for i in range(50):
        shengfen = r.html.find('#cont > h1', first=True)
        kelei = r.html.find('#cont > table > tbody > tr:nth-child(' + str(i+9) + ') > td:nth-child(3)', first=True)
        yuanxi = r.html.find('#cont > table > tbody > tr:nth-child(' + str(i+9) + ') > td:nth-child(2)', first=True)
        zhuanye = r.html.find('#cont > table > tbody > tr:nth-child(' + str(i+9) + ') > td:nth-child(1)', first=True)
        high = r.html.find('#cont > table > tbody > tr:nth-child(' + str(i+9) + ') > td:nth-child(4)', first=True)
        low = r.html.find('#cont > table > tbody > tr:nth-child(' + str(i+9) + ') > td:nth-child(5)', first=True)
        avg = r.html.find('#cont > table > tbody > tr:nth-child(' + str(i+9) + ') > td:nth-child(6)', first=True)
        if kelei and yuanxi and zhuanye and high and low and avg:
            csvwriter.writerow([kelei.text, yuanxi.text, zhuanye.text, high.text, low.text, avg.text, shengfen.text.split('(')[1].split(')')[0], '2016'])
            print(shengfen.text.split('(')[1].split(')')[0])
        elif kelei and yuanxi and zhuanye and high and low:
            # no average on this row: leave the 平均分 column blank so the columns stay aligned
            csvwriter.writerow([kelei.text, yuanxi.text, zhuanye.text, high.text, low.text, ' ', shengfen.text.split('(')[1].split(')')[0], '2016'])
            print(shengfen.text.split('(')[1].split(')')[0])
        else:
            pass

file.close()
# -*- coding: utf-8 -*-
from requests_html import HTMLSession
import csv

session = HTMLSession()

file = open('2016.csv', 'w', newline='')
csvwriter = csv.writer(file)
csvwriter.writerow(['科类', '院系', '专业', '最高分', '最低分', '平均分', '省份', '年份'])

links = ['http://admission.sysu.edu.cn/zs01/zs01c/chongqing/2018n.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/guangdong/2018n.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/shandong/2018n.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/jiangsu/2018n.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/sichuan/2018n.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/henan/2018n.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/hebei/2018n.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/shanxi/2018n.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/shanx/2018n.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/liaoning/2018n.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/qinghai/2018n.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/ningxia/2018n.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/neimenggu/2018n.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/jiangxi/2018n.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/jilin/2018n.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/hunan/2018n.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/hubei/2018n.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/yunnan/2018n.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/guizhou/2018n.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/xinjiang/2018n.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/gansu/2018n.htm',
         'http://admission.sysu.edu.cn/zs01/zs01c/xizang/2018n.htm',
        ]

for link in links:
    r = session.get(link)
    for i in range(50):
        shengfen = r.html.find('#cont > h1', first=True)
        kelei = r.html.find('#cont > table > tbody > tr:nth-child(' + str(i+9) + ') > td:nth-child(3)', first=True)
        yuanxi = r.html.find('#cont > table > tbody > tr:nth-child(' + str(i+9) + ') > td:nth-child(2)', first=True)
        zhuanye = r.html.find('#cont > table > tbody > tr:nth-child(' + str(i+9) + ') > td:nth-child(1)', first=True)
        high = r.html.find('#cont > table > tbody > tr:nth-child(' + str(i+9) + ') > td:nth-child(7)', first=True)
        low = r.html.find('#cont > table > tbody > tr:nth-child(' + str(i+9) + ') > td:nth-child(8)', first=True)
        avg = r.html.find('#cont > table > tbody > tr:nth-child(' + str(i+9) + ') > td:nth-child(9)', first=True)
        if kelei and yuanxi and zhuanye and high and low and avg:
            csvwriter.writerow([kelei.text, yuanxi.text, zhuanye.text, high.text, low.text, avg.text, shengfen.text.split('(')[1].split(')')[0], '2016'])
        elif kelei and yuanxi and zhuanye and high and low:
            # no average on this row: leave the 平均分 column blank so the columns stay aligned
            csvwriter.writerow([kelei.text, yuanxi.text, zhuanye.text, high.text, low.text, ' ', shengfen.text.split('(')[1].split(')')[0], '2016'])
        else:
            pass

file.close()


Weibo Hot-Comment Sentiment Analysis

Posted on 2019-05-13 | In python, learning web scraping, crawler examples |
Words count in article: 2.1k | Reading time ≈ 8

Studying on python123: https://python123.io/tutorials/weibo_sentiment_anlysis


Word Clouds You Didn't Know About

Posted on 2019-05-13 | In python, learning web scraping, crawler examples |
Words count in article: 8.6k | Reading time ≈ 40

Studying on python123: https://python123.io/tutorials/word_cloud


Baidu AI Search Engine

Posted on 2019-05-13 | In python, learning web scraping, crawler examples |
Words count in article: 6.5k | Reading time ≈ 37

Following the python123 course: https://python123.io/index/tutorials/web_crawler_intro


Aggregated News Headlines

Posted on 2019-05-13 | In python, learning web scraping, crawler examples |
Words count in article: 1.4k | Reading time ≈ 6

Following the python123 course: https://python123.io/index/tutorials/web_crawler_intro


Scraping Douban Movies and Recording Titles and Years to a CSV File

Posted on 2019-05-13 | In python, learning web scraping, crawler examples |
Words count in article: 526 | Reading time ≈ 2

1. Scraping Douban movies and recording each title and year in a CSV file:


This uses HTMLSession from the fairly new requests_html library.

Official documentation: https://cncert.github.io/requests-html-doc-cn/#/?id=%E4%BD%BF%E7%94%A8%E6%96%B9%E6%B3%95

# -*- coding: utf-8 -*-
from requests_html import HTMLSession
import csv

session = HTMLSession()

file = open('movies.csv', 'w', newline='')
csvwriter = csv.writer(file)
csvwriter.writerow(['名称', '年份'])

links = ['https://movie.douban.com/subject/1292052/', 'https://movie.douban.com/subject/26752088/', 'https://movie.douban.com/subject/1962665/']

for link in links:
    r = session.get(link)
    title = r.html.find('#content > h1 > span:nth-child(1)', first=True)
    year = r.html.find('#content > h1 > span.year', first=True)
    csvwriter.writerow([title.text, year.text])

file.close()

2. Scraping crawler-engineer salary data for the Beijing area
# -*- coding: utf-8 -*-
from requests_html import HTMLSession
import re
from matplotlib import pyplot as plt

salary_element = r'<p.*>(\d+)K-(\d+)K</p>'
salary = []
disabled_button_element = r'<button.* disabled="disabled">下一页</button>'
disabled_button = None
p = 1

while not disabled_button:
    print('Scraping page ' + str(p))
    url = 'https://sou.zhaopin.com/?p=' + str(p) + '&jl=530&kw=爬虫工程师&kt=3'
    session = HTMLSession()
    page = session.get(url)
    page.html.render(sleep=20)
    # Extract the salaries, collected as pairs like [[10, 20], [15, 20], [12, 15]]
    salary += re.findall(salary_element, page.html.html)
    # Check whether the "next page" button on the page is still clickable
    disabled_button = re.findall(disabled_button_element, page.html.html)
    p = p + 1
    session.close()

# Average salary per company, e.g. [12, 15] averages to 13.5
salary = [(int(s[0]) + int(s[1])) / 2 for s in salary]
# Bucket the salaries for display; other groupings would work too
low_salary, middle_salary, high_salary = [0, 0, 0]
for s in salary:
    if s <= 15:
        low_salary += 1
    elif s > 15 and s <= 30:
        middle_salary += 1
    else:
        high_salary += 1
# Set the figure size (width, height)
plt.figure(figsize=(6, 9))
# Pie chart labels, as a list
labels = [u'<15K', u'15K-30K', u'>30K']
data = [low_salary, middle_salary, high_salary]
plt.pie(data, labels=labels)
# Equal x/y scales so the pie stays round
plt.axis('equal')
plt.legend()
plt.show()


Error encountered:
RuntimeError: Cannot use HTMLSession within an existing event loop. Use AsyncHTMLSession instead.

See: https://github.com/kennethreitz/requests-html/issues — still unresolved; even the maintainer has not replied. It seems related to async. I will dig into the library's documentation when I have time (or just stop using this library; it is hard to Google, so search the project's GitHub issues instead).
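
A possible workaround sketch using the AsyncHTMLSession that the error message suggests; I have not verified this, so treat the arender()/run() details as assumptions drawn from the library's README:

from requests_html import AsyncHTMLSession

asession = AsyncHTMLSession()

async def fetch_page():
    # assumed async counterparts of session.get() and html.render()
    r = await asession.get('https://sou.zhaopin.com/?p=1&jl=530&kw=爬虫工程师&kt=3')
    await r.html.arender(sleep=20)
    return r

page = asession.run(fetch_page)[0]  # run() drives the coroutine on its own event loop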

Getting Started with the Scrapy Crawler Framework

Posted on 2019-05-13 | In python, learning web scraping, Scrapy framework study |
Words count in article: 1.3k | Reading time ≈ 5


  • Scrapy Engine: coordinates the communication, signals, and data flow among the Spider, Item Pipeline, Downloader, and Scheduler.

  • Scheduler: accepts the Requests sent over by the engine, organizes and enqueues them in a defined order, and hands them back when the engine asks for them.

  • Downloader: downloads every Request the Scrapy Engine sends and returns the Responses it obtains to the engine, which passes them on to the Spider for processing.

  • Spider: processes all Responses, analyzes and extracts the data needed for Item fields, and submits any follow-up URLs to the engine, which feeds them back into the Scheduler.

  • Item Pipeline: the place where Items obtained by the Spider are post-processed (detailed analysis, filtering, storage, and so on).

  • Downloader Middlewares: components you can customize to extend the download functionality.

  • Spider Middlewares: components for customizing and hooking into the communication between the engine and the Spider (e.g. Responses entering the Spider and Requests leaving it).

  • Create a Scrapy project
  • Define the Items to extract
  • Write a spider to crawl the site and extract the Items
  • Write an Item Pipeline to store the extracted Items (i.e. the data); a minimal spider sketch follows below
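
As a taste of those four steps, here is a minimal self-contained spider sketch; the quotes.toscrape.com target and its selectors are illustrative, and it can be run standalone with `scrapy runspider quotes_spider.py`:

import scrapy

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        # yield one item dict per quote block; an Item Pipeline could store these
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('small.author::text').get(),
            }
        # follow-up URL goes back through the engine to the Scheduler
        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)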