代码
px500.py
一些自定义的类,包括用户,画廊,图片
# -*- coding: utf-8 -*-
# A 500px user.
class User(object):
    """A 500px account, described by its id and user name.

    Attributes:
        id: User id exactly as passed to the constructor.
        name: User name exactly as passed to the constructor.
    """

    # Class-level defaults; __init__ overrides both on every instance.
    id = 0
    name = ''

    def __init__(self, id, name):
        """Store the given id and name on the instance."""
        self.id = id
        self.name = name

    def __repr__(self):
        # str.format converts any value, so repr never raises on non-strings.
        return 'User [id={}, name={}]'.format(self.id, self.name)
# A 500px gallery.
class Gallery(object):
    """A gallery together with the images collected for it.

    Attributes:
        id: Gallery id as passed to the constructor.
        name: Gallery name; characters that are illegal in file names are
            replaced with ``_`` so the name can be used as a directory name.
        path: Gallery path as passed to the constructor.
        images: Tuple of images belonging to the gallery (initially empty).
    """

    # Characters that cannot appear in a file or directory name.
    _ILLEGAL_CHARS = '\\/:*?"<>|'
    _SANITIZE = str.maketrans(_ILLEGAL_CHARS, '_' * len(_ILLEGAL_CHARS))

    # Class-level defaults; __init__ overrides them on every instance.
    id = 0
    name = ''
    path = ''
    images = ()

    def __init__(self, id, name, path):
        """Store the gallery fields, sanitising ``name`` when it is a string."""
        self.id = id
        # Non-string names are stored untouched rather than rejected.
        if isinstance(name, str):
            name = name.translate(self._SANITIZE)
        self.name = name
        self.path = path
        self.images = ()

    def __repr__(self):
        # str.format converts every field, so a non-string name or path
        # no longer raises TypeError the way '+' concatenation does.
        return 'Gallery [id={}, name={}, path={},images={}]'.format(
            self.id, self.name, self.path, self.images)
# A 500px image.
class Image(object):
    """One downloadable rendition of a photo.

    Attributes:
        id: Photo id as passed to the constructor.
        name: Photo name with every character that is illegal in a file
            name replaced by ``_``; ``None`` becomes the empty string.
        url: Download URL as passed to the constructor.
        format: Image format as passed to the constructor.
    """

    # Some photo names contain characters that are illegal in file names;
    # left untouched, such a name cannot be used as a file name.
    _ILLEGAL_CHARS = '\\/:*?"<>|'
    _SANITIZE = str.maketrans(_ILLEGAL_CHARS, '_' * len(_ILLEGAL_CHARS))

    # Class-level defaults; __init__ overrides them on every instance.
    id = 0
    name = ''
    url = ''
    format = ''

    def __init__(self, id, name, url, format):
        """Store the image fields and sanitise ``name`` for use in a file name."""
        self.id = id
        # A missing name is treated as empty instead of raising AttributeError.
        text = '' if name is None else str(name)
        # One translate pass replaces all illegal characters at once.
        self.name = text.translate(self._SANITIZE)
        self.url = url
        self.format = format

    def __repr__(self):
        # str.format converts every field, so non-string values do not raise.
        return 'Image [id={}, name={}, url={}, format={}]'.format(
            self.id, self.name, self.url, self.format)
csrf.py
用于从 html 里解析出 csrf token
# -*- coding: utf-8 -*-
from html.parser import HTMLParser
# Extracts the CSRF token from the 500px.com login page. The tags to parse
# look like this:
#
# <meta name="csrf-param" content="authenticity_token" />
# <meta name="csrf-token" content="LeMJgYNZrozw+7niJ8OsZy0AzxT2DwuJPTV+cwR8ZX0QBHhG49..." />
class Csrf(HTMLParser):
    """HTML parser that records the CSRF parameter name and token.

    After ``feed()`` has seen the relevant ``<meta>`` tags, ``param`` holds
    the content of the ``csrf-param`` tag and ``token`` the content of the
    ``csrf-token`` tag. Both stay ``''`` when no such tag was found.
    """

    param = ''
    token = ''

    def handle_starttag(self, tag, attrs):
        """Handle ``<meta ...>`` written without a trailing slash."""
        self._capture(tag, attrs)

    def handle_startendtag(self, tag, attrs):
        """Handle self-closing ``<meta ... />``."""
        self._capture(tag, attrs)

    def _capture(self, tag, attrs):
        """Record param/token from a meta tag, whatever its attribute order."""
        if tag != 'meta':
            return
        # Look attributes up by name so that order and extra attributes
        # (id, data-*, ...) do not matter.
        attr_map = dict(attrs)
        content = attr_map.get('content')
        if content is None:
            return
        name = attr_map.get('name')
        if name == 'csrf-param':
            self.param = content
        elif name == 'csrf-token':
            self.token = content
spider.py
主程序
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import http.cookiejar
import json
import os
import os.path
import urllib.parse
import urllib.request
from px500 import User, Gallery, Image
from csrf import Csrf
# Root directory under which the images are saved.
# The directory and the credentials below can be overridden through
# environment variables, so real credentials need not be written into this
# file; the literals are only the fallback defaults.
home = os.environ.get('PX500_HOME', 'd:\\500px')
# Email and password of the 500px account used to log in.
email = os.environ.get('PX500_EMAIL', '[email protected]')
password = os.environ.get('PX500_PASSWORD', '12345678')
# Whether to download again and overwrite images that already exist.
overwrite = False
# Parser holding the 500px CSRF parameter name and token.
csrf = Csrf()
# The logged-in 500px user; rebound by login().
user = None
# The user's galleries; rebound by get_galleries().
galleries = ()
# Download counter shown in the progress output; incremented by save().
count = 1
# Make the urllib.request library keep cookies between requests.
def enable_cookie():
    """Install a global urllib opener that is backed by a fresh CookieJar."""
    jar = http.cookiejar.CookieJar()
    cookie_handler = urllib.request.HTTPCookieProcessor(jar)
    urllib.request.install_opener(urllib.request.build_opener(cookie_handler))
# Visit the login page to obtain the CSRF token.
def visit_login_page():
    """Download the 500px login page and feed its HTML to the ``csrf`` parser."""
    with urllib.request.urlopen('https://500px.com/login') as response:
        page = response.read().decode('utf8')
    # The module-level parser is only mutated, never rebound, so no
    # ``global`` statement is needed here.
    csrf.feed(page)
# Log in to 500px.
def login():
    """Post the credentials plus the CSRF token and remember the user.

    Reads the module-level ``email``, ``password`` and ``csrf``. On success
    the module-level ``user`` is rebound to a ``User`` built from the
    ``user`` object of the JSON response.
    """
    global user
    # Logging in returns the user information.
    login_url = 'https://api.500px.com/v1/session'
    form = {
        'session[email]': email,
        'session[password]': password,
        # The name of the CSRF field itself was read from the login page.
        csrf.param: csrf.token,
    }
    body = urllib.parse.urlencode(form).encode('ascii')
    req = urllib.request.Request(login_url, body)
    with urllib.request.urlopen(req) as rsp:
        text = rsp.read().decode('utf8')
    # json.loads accepts only the document positionally on Python 3.6+;
    # passing an encoding as second argument raises TypeError there.
    account = json.loads(text)['user']
    user = User(account['id'], account['username'])
# Create the user directory.
def create_user_dir():
    """Create ``home`` and, inside it, a directory named after the user.

    Each directory becomes the working directory right after it has been
    ensured, so on return the process is inside ``<home>/<user name>``.
    """
    for directory in (home, user.name):
        # makedirs with exist_ok avoids the exists()/mkdir() race and also
        # creates missing parent directories of ``home``.
        os.makedirs(directory, exist_ok=True)
        os.chdir(directory)
# Fetch the gallery information of the logged-in user.
def get_galleries():
    """Request the user's galleries and store them in ``galleries``.

    Rebinds the module-level ``galleries`` to a tuple with one ``Gallery``
    (id, name, custom_path) per entry of the response's ``galleries`` list.
    NOTE: only page 1 with up to 50 entries is requested.
    """
    global galleries
    query = {
        'privacy': 'both',
        'include_cover': 1,
        'cover_size': 440,
        'kinds': '0,1,2,4,5',
        'rpp': 50,
        'sort': 'position',
        'sort_direction': 'asc',
        'page': 1,
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'X-CSRF-Token': csrf.token,
        'Referer': 'https://500px.com/' + user.name + '/galleries',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
    }
    # This must be a GET request, so the parameters go into the query string.
    galleries_url = ('https://api.500px.com/v1/users/' + str(user.id)
                     + '/galleries?' + urllib.parse.urlencode(query))
    req = urllib.request.Request(url=galleries_url, headers=headers)
    with urllib.request.urlopen(req) as rsp:
        text = rsp.read().decode('utf8')
    # json.loads accepts only the document positionally on Python 3.6+.
    payload = json.loads(text)
    galleries = tuple(
        Gallery(entry['id'], entry['name'], entry['custom_path'])
        for entry in payload['galleries']
    )
# Create the gallery directories.
def create_gallery_dir():
    """Create a directory per gallery name, relative to the working directory.

    Names for which a path already exists are left alone.
    """
    for entry in galleries:
        dirname = entry.name
        if os.path.exists(dirname):
            continue
        os.mkdir(dirname)
# Fetch the images of a gallery.
def get_images(gallery):
    """Collect the 2048-size renditions of every photo in ``gallery``.

    Requests the gallery items page by page (50 per page) and appends an
    ``Image`` for each rendition whose ``size`` is 2048 to
    ``gallery.images``, which is kept as a tuple and updated after each page.

    Args:
        gallery: The ``Gallery`` to fill; its ``id`` and ``path`` are read.
    """
    # These request headers are required; without them the API returns 401.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'X-CSRF-Token': csrf.token,
        'Referer': 'https://500px.com/' + user.name + '/galleries/' + gallery.path,
        'Accept': 'application/json, text/javascript, */*; q=0.01',
    }
    # Headers and URL prefix do not change between pages, so build them once.
    base_url = ('https://api.500px.com/v1/users/' + str(user.id)
                + '/galleries/' + str(gallery.id) + '/items?')
    page = 1
    while True:
        query = {
            'formats': 'jpeg,lytro',
            'image_size[]': 2048,
            'include_licensing': 'true',
            'page': page,
            'rpp': 50,
            'sort': 'position',
            'sort_direction': 'asc',
        }
        req = urllib.request.Request(base_url + urllib.parse.urlencode(query), None, headers)
        with urllib.request.urlopen(req) as rsp:
            text = rsp.read().decode('utf8')
        # json.loads accepts only the document positionally on Python 3.6+.
        payload = json.loads(text)
        found = [
            Image(photo['id'], photo['name'], rendition['url'], rendition['format'])
            for photo in payload['photos']
            for rendition in photo['images']
            if rendition['size'] == 2048
        ]
        gallery.images = tuple(gallery.images) + tuple(found)
        # '>=' rather than '==': when total_pages is 0 (empty gallery) an
        # equality test is never true and the loop would never end.
        if page >= payload['total_pages']:
            break
        page += 1
# Save an image. The download times out after 30 seconds; when downloading or
# saving fails, the file that was created (most likely empty) is deleted.
def save(image):
    """Download ``image`` into the working directory unless it already exists.

    The file is named ``<name>_<id>.<format>``. One progress line is printed
    per call, and the module-level ``count`` is incremented in every case.
    """
    global count
    filename = ''.join((image.name, '_', str(image.id), '.', image.format))
    tips = '{}: {}'.format(count, filename)
    if not overwrite and os.path.exists(filename):
        print(tips, 'exist,ignored.')
    else:
        try:
            with open(filename, 'wb') as file, \
                    urllib.request.urlopen(image.url, timeout=30) as rsp:
                file.write(rsp.read())
            print(tips, "saved")
        except Exception as e:
            print(tips, e, image.url)
            # Remove whatever was created before the failure.
            if os.path.exists(filename):
                os.remove(filename)
    count += 1
# ------------ main program starts here ------------
print('start to fetch your 500px.com galleries images, please wait...')
# Enable cookie support.
enable_cookie()
# Log in.
visit_login_page()
login()
print('login 500px.com success.')
# Create the user directory and make it the working directory.
create_user_dir()
# Fetch the user's galleries, then create one directory per gallery.
get_galleries()
print('get your galleries information success.')
create_gallery_dir()
# Collect the image list of every gallery.
for gallery in galleries:
    get_images(gallery)
print('get your galleries images information success.')
# Download the images of each gallery into that gallery's directory.
for gallery in galleries:
    os.chdir(gallery.name)
    try:
        for image in gallery.images:
            save(image)
    finally:
        # Always step back out, so the working directory is the user
        # directory again even if a download raised.
        os.chdir('..')
print('\n\nfetch your 500px.com galleries images done.bye~bye~')