代码

px500.py

一些自定义的类,包括用户(User)、画廊(Gallery)和图片(Image)。

# -*- coding: utf-8 -*-


# A 500px user account
class User(object):
    """Identity of a 500px user: a numeric id together with the user name."""

    # Class-level defaults; __init__ shadows both on every instance.
    id = 0
    name = ''

    def __init__(self, id, name):
        """Remember the given id and name on this instance."""
        self.id, self.name = id, name


# A 500px gallery
class Gallery(object):
    """A 500px gallery: its id, display name, URL path and collected images.

    String names have the characters ``\\ / : * ? " < > |`` replaced with
    ``_`` so that the name can be used as a directory name; this is the same
    rule applied to image names elsewhere in this code base.
    """

    # Class-level defaults; instances overwrite id/name/path in __init__.
    id = 0
    name = ''
    path = ''
    # Immutable default so instances never share a mutable container.
    images = ()

    # Characters that are not allowed in a file or directory name.
    _ILLEGAL_CHARS = '\\/:*?"<>|'
    _SANITIZE_TABLE = str.maketrans(_ILLEGAL_CHARS, '_' * len(_ILLEGAL_CHARS))

    def __init__(self, id, name, path):
        """Store the gallery fields.

        Args:
            id: Gallery identifier.
            name: Gallery display name; illegal file-name characters in a
                string value are replaced with ``_``.
            path: Gallery path component (may be ``None``).
        """
        self.id = id
        # Only strings can be sanitized; any other value is stored untouched.
        if isinstance(name, str):
            name = name.translate(self._SANITIZE_TABLE)
        self.name = name
        self.path = path

    def __repr__(self):
        """Return a debug string; safe even when a field is not a string."""
        # str.format converts every field itself, so a None path or name no
        # longer raises TypeError the way '+' concatenation did.
        return 'Gallery [id={}, name={}, path={},images={}]'.format(
            self.id, self.name, self.path, self.images)


# A 500px image
class Image(object):
    """One downloadable rendition of a 500px photo.

    The name is sanitized on construction so it can be used as a file name.
    """

    # Class-level defaults; instances overwrite all of them in __init__.
    id = 0
    name = ''
    url = ''
    format = ''

    # Characters that are not allowed in a file name.
    _ILLEGAL_CHARS = '\\/:*?"<>|'
    _SANITIZE_TABLE = str.maketrans(_ILLEGAL_CHARS, '_' * len(_ILLEGAL_CHARS))

    def __init__(self, id, name, url, format):
        """Store the image fields.

        Args:
            id: Photo identifier.
            name: Photo name; illegal file-name characters in a string value
                are replaced with ``_``.
            url: Download URL of the image.
            format: Image format, used by the caller as the file extension.
        """
        self.id = id
        self.url = url
        self.format = format

        # Some image names contain characters that are illegal in file names;
        # replace them all in a single pass instead of one replace() per
        # character.
        if isinstance(name, str):
            name = name.translate(self._SANITIZE_TABLE)
        self.name = name

    def __repr__(self):
        """Return a debug string; safe even when a field is not a string."""
        # str.format converts every field itself, so a None url/format no
        # longer raises TypeError the way '+' concatenation did.
        return 'Image [id={}, name={}, url={}, format={}]'.format(
            self.id, self.name, self.url, self.format)

csrf.py

用于从 HTML 页面中解析出 CSRF token。

# -*- coding: utf-8 -*-
from html.parser import HTMLParser


# Extracts the CSRF parameter name and token from the 500px.com login page.
# The tags of interest look like this:
#
# <meta name="csrf-param" content="authenticity_token" />
# <meta name="csrf-token" content="LeMJgYNZrozw+7niJ8OsZy0AzxT2DwuJPTV+cwR8ZX0QBHhG49..." />
class Csrf(HTMLParser):
    """HTML parser that collects the ``csrf-param`` / ``csrf-token`` meta tags.

    After ``feed()``, ``param`` holds the form field name and ``token`` the
    token value; both stay ``''`` when the tags are not present.
    """

    param = ''
    token = ''

    def handle_starttag(self, tag, attrs):
        """Inspect every opening tag, whether or not it is self-closing.

        Attributes are looked up by name, so their order and any additional
        attributes on the tag do not matter.
        """
        if tag != 'meta':
            return
        attributes = dict(attrs)
        content = attributes.get('content')
        if content is None:
            # No usable value on this tag; keep whatever was found before.
            return
        meta_name = attributes.get('name')
        if meta_name == 'csrf-param':
            self.param = content
        elif meta_name == 'csrf-token':
            self.token = content

    def handle_startendtag(self, tag, attrs):
        """Treat a self-closing ``<meta ... />`` like a plain opening tag."""
        self.handle_starttag(tag, attrs)

spider.py

主程序

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import http.cookiejar
import json
import os
import os.path
import urllib.parse
import urllib.request

from px500 import User, Gallery, Image
from csrf import Csrf

# Root directory under which downloaded images are saved
home = 'd:\\500px'

# E-mail address and password of the 500px account used to log in
# NOTE(review): these look like hard-coded placeholder credentials — replace
# them before running and avoid committing real values.
email = '[email protected]'
password = '12345678'

# Whether to re-download and overwrite images that already exist on disk
overwrite = False

# Parser holding the 500px CSRF parameter name and token
csrf = Csrf()
# The logged-in 500px user (assigned by login())
user = None
# Tuple of the user's galleries (assigned by get_galleries())
galleries = ()
# Running download counter, used to number the progress messages in save()
count = 1


# Make urllib.request keep cookies between requests
def enable_cookie():
    """Install a global urllib opener backed by a fresh in-memory cookie jar."""
    jar = http.cookiejar.CookieJar()
    cookie_handler = urllib.request.HTTPCookieProcessor(jar)
    urllib.request.install_opener(urllib.request.build_opener(cookie_handler))


# Visit the login page to obtain the CSRF token
def visit_login_page():
    """Download the 500px login page and feed its HTML to the ``csrf`` parser."""
    with urllib.request.urlopen('https://500px.com/login') as response:
        page = response.read().decode('utf8')
    # The module-level parser records the CSRF parameter name and token.
    csrf.feed(page)


# Log in to 500px
def login():
    """Post the credentials to the session API and store the resulting user.

    Sends the module-level ``email`` / ``password`` together with the CSRF
    field collected by ``csrf``, then sets the module-level ``user`` from the
    ``user`` object of the JSON response.

    Raises:
        urllib.error.URLError: if the request fails.
        KeyError: if the response has no ``user`` / ``id`` / ``username``.
    """
    global user

    login_url = 'https://api.500px.com/v1/session'
    form = {
        'session[email]': email,
        'session[password]': password,
        csrf.param: csrf.token,
    }
    # urlencode percent-escapes everything, so the result is pure ASCII.
    body = urllib.parse.urlencode(form).encode('ascii')
    req = urllib.request.Request(login_url, body)
    with urllib.request.urlopen(req) as rsp:
        payload = rsp.read().decode('utf8')

    # json.loads accepts only the document positionally on Python 3.6+; the
    # text is already decoded, so no encoding argument is needed.
    account = json.loads(payload)['user']
    user = User(account['id'], account['username'])


# Create the user's directory
def create_user_dir():
    """Create ``<home>/<user name>`` if needed and make it the working directory.

    Missing intermediate directories are created as well, and an already
    existing directory is not an error.

    Raises:
        OSError: if the directory cannot be created or entered.
    """
    user_dir = os.path.join(home, user.name)
    # makedirs replaces the racy "exists? then mkdir" pair and also copes
    # with a home path whose parent directories do not exist yet.
    os.makedirs(user_dir, exist_ok=True)
    os.chdir(user_dir)


# Fetch the user's gallery information
def get_galleries():
    """Query the galleries API and store the result in module-level ``galleries``.

    Requests the first page (up to 50 entries) of the logged-in user's
    galleries and builds one ``Gallery`` per entry from its ``id``, ``name``
    and ``custom_path`` fields.

    Raises:
        urllib.error.URLError: if the request fails.
        KeyError: if the response lacks one of the expected fields.
    """
    global galleries

    query = {
        'privacy': 'both',
        'include_cover': 1,
        'cover_size': 440,
        'kinds': '0,1,2,4,5',
        'rpp': 50,
        'sort': 'position',
        'sort_direction': 'asc',
        'page': 1,
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'X-CSRF-Token': csrf.token,
        'Referer': 'https://500px.com/' + user.name + '/galleries',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
    }
    # Must be a GET request, so the parameters go into the query string.
    galleries_url = ('https://api.500px.com/v1/users/' + str(user.id) +
                     '/galleries?' + urllib.parse.urlencode(query))
    req = urllib.request.Request(url=galleries_url, headers=headers)
    with urllib.request.urlopen(req) as rsp:
        payload = rsp.read().decode('utf8')

    # json.loads accepts only the document positionally on Python 3.6+; the
    # text is already decoded, so no encoding argument is needed.
    json_obj = json.loads(payload)
    galleries = tuple(
        Gallery(entry['id'], entry['name'], entry['custom_path'])
        for entry in json_obj['galleries']
    )


# Create one directory per gallery
def create_gallery_dir():
    """Create a directory named after each gallery that does not exist yet.

    Paths are relative to the current working directory.
    """
    # The generator is lazy, so each existence check still happens right
    # before the corresponding mkdir.
    missing = (entry.name for entry in galleries if not os.path.exists(entry.name))
    for dirname in missing:
        os.mkdir(dirname)


# Fetch the images contained in a gallery
def get_images(gallery):
    """Collect the 2048-size renditions of every photo in ``gallery``.

    Walks the gallery's item pages (50 photos per page) and appends one
    ``Image`` per matching rendition to ``gallery.images``, which is kept as
    a tuple and updated after every page.

    Args:
        gallery: The ``Gallery`` whose ``images`` attribute is extended.

    Raises:
        urllib.error.URLError: if a request fails.
        KeyError: if a response lacks one of the expected fields.
    """
    # These request headers are required; without them the API answers 401.
    # A gallery without a path contributes an empty path component instead of
    # raising TypeError on concatenation.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'X-CSRF-Token': csrf.token,
        'Referer': 'https://500px.com/' + user.name + '/galleries/' + str(gallery.path or ''),
        'Accept': 'application/json, text/javascript, */*; q=0.01',
    }
    items_url = ('https://api.500px.com/v1/users/' + str(user.id) +
                 '/galleries/' + str(gallery.id) + '/items?')

    page = 1
    while True:
        query = {
            'formats': 'jpeg,lytro',
            'image_size[]': 2048,
            'include_licensing': 'true',
            'page': page,
            'rpp': 50,
            'sort': 'position',
            'sort_direction': 'asc',
        }
        req = urllib.request.Request(items_url + urllib.parse.urlencode(query), None, headers)
        with urllib.request.urlopen(req) as rsp:
            payload = rsp.read().decode('utf8')

        # json.loads accepts only the document positionally on Python 3.6+;
        # the text is already decoded, so no encoding argument is needed.
        json_obj = json.loads(payload)
        found = [
            Image(photo['id'], photo['name'], rendition['url'], rendition['format'])
            for photo in json_obj['photos']
            for rendition in photo['images']
            if rendition['size'] == 2048
        ]
        gallery.images = tuple(gallery.images) + tuple(found)

        # '>=' rather than '==': an empty gallery reports 0 total pages, which
        # an equality test starting at page 1 would never reach.
        if page >= json_obj['total_pages']:
            break
        page += 1


# Save one image to disk; the download times out after 30 seconds
def save(image):
    """Download ``image`` into the current directory and report the outcome.

    The file is named ``<name>_<id>.<format>``. An existing file is skipped
    unless the module-level ``overwrite`` flag is set. Data is written to a
    temporary ``.part`` file and renamed only after it was written
    completely, so a failed download neither leaves an empty file behind nor
    destroys a previously saved copy. The module-level ``count`` is
    incremented on every call.

    Args:
        image: The ``Image`` to download (uses ``name``, ``id``, ``format``
            and ``url``).
    """
    global count

    filename = image.name + '_' + str(image.id) + '.' + image.format
    tips = str(count) + ': ' + filename
    if overwrite or not os.path.exists(filename):
        partial = filename + '.part'
        try:
            # Download first: the target is not touched until data arrived.
            with urllib.request.urlopen(image.url, timeout=30) as rsp:
                content = rsp.read()
            with open(partial, 'wb') as file:
                file.write(content)
            # Replaces an existing target in one step.
            os.replace(partial, filename)
            print(tips, "saved")
        except Exception as e:
            # Best effort: report the failure and carry on with the next image.
            print(tips, e, image.url)
            if os.path.exists(partial):
                os.remove(partial)
    else:
        print(tips, 'exist,ignored.')
    count += 1


# ------------ Main program starts here ------------
print('start to fetch your 500px.com galleries images, please wait...')

# Enable cookie support
enable_cookie()

# Log in
visit_login_page()
login()

print('login 500px.com success.')

# Create the user's directory and switch into it
create_user_dir()

# Fetch the user's galleries
get_galleries()

print('get your galleries information success.')

# Create one directory per gallery inside the user's directory
create_gallery_dir()

# Collect the image information of every gallery
for gallery in galleries:
    get_images(gallery)

print('get your galleries images information success.')

# Download the images of every gallery to disk
for gallery in galleries:
    # save() writes into the current directory, so step into the gallery's
    # directory for the duration of its downloads.
    os.chdir(gallery.name)
    for image in gallery.images:
        save(image)
    os.chdir('..')

print('\n\nfetch your 500px.com galleries images done.bye~bye~')

results matching ""

    No results matching ""