# Commonly used pyspider template.

from pyspider.libs.base_handler import *
import time
import re


class Handler(BaseHandler):
    """pyspider handler that scrapes video entries from www.yntv.cn list pages.

    ``on_start`` seeds two list pages, ``index_page`` follows every video
    link found on them, and ``detail_page`` turns one rendered video page
    into a flat result dict.
    """

    # NOTE(review): pyspider's documentation places the user agent under
    # "headers" (or the "user_agent" option). Confirm that this top-level
    # "User-Agent" key actually reaches the fetcher; it is left untouched
    # here because switching to a mobile UA could change the served markup.
    crawl_config = {

        "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 8_0_2 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12A366 Safari/600.1.4"

    }

    # List pages seeded by on_start (sectionid 425 and 429, first page each).
    _START_URLS = (
        'http://www.yntv.cn/page_list_text/picindex.html?sectionid=425&page=1&title=%E7%B2%BE%E5%BD%A9%E8%A7%86%E9%A2%91',
        'http://www.yntv.cn/page_list_text/picindex.html?sectionid=429&page=1&title=%E7%83%AD%E6%92%AD%E5%89%A7%E5%9C%BA',
    )

    # Script run in the rendered detail page; it returns the page's global
    # ``video`` value, which detail_page reads back via js_script_result.
    _VIDEO_JS = '''
               function() {
                   return video
               }
               '''

    # "编辑:" label followed by the editor's name (first run of non-space).
    _EDITOR_RE = re.compile(r'编辑:(\S*)')
    # Publication timestamp formatted as "YYYY-MM-DD HH:MM".
    _PUBLISH_RE = re.compile(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}')

    @every(minutes=24 * 60)
    def on_start(self):
        """Schedule every start URL for JS-rendered fetching (runs daily)."""
        for start_url in self._START_URLS:
            self.crawl(start_url, callback=self.index_page, fetch_type='js')

    @config(age=23 * 60 * 60)
    def index_page(self, response):
        """Queue a detail-page crawl for each video listed on ``response``.

        The poster image URL is forwarded to ``detail_page`` through the
        task's ``save`` dict under the key ``'p'``.
        """
        link_boxes = response.doc('.video_item_img').items()
        poster_imgs = response.doc('.item_img').items()

        # zip() pairs the n-th link box with the n-th poster image and stops
        # at the shorter of the two sequences.
        for link_box, poster_img in zip(link_boxes, poster_imgs):
            href = link_box('a').attr.href
            if not href:
                # No usable link in this item: nothing to crawl.
                continue

            self.crawl(href, callback=self.detail_page, fetch_type='js',
                       save={'p': poster_img.attr.src},
                       js_script=self._VIDEO_JS)

    @config(priority=10)
    def detail_page(self, response):
        """Build the result record for a single video page.

        Every extracted field is best-effort: a missing editor, video URL or
        column yields ``''``, and a missing or unparsable publish time falls
        back to the current time.

        Returns:
            dict: the flat record stored as this task's result.
        """
        # Sample the clock once so all "now" fields of a record agree.
        now = time.time()
        page_text = response.text

        editor_match = self._EDITOR_RE.search(page_text)
        author = editor_match.group(1) if editor_match else ''

        publish_time = now
        stamp_match = self._PUBLISH_RE.search(page_text)
        if stamp_match:
            try:
                publish_time = time.mktime(
                    time.strptime(stamp_match.group(0), "%Y-%m-%d %H:%M"))
            except (ValueError, OverflowError):
                # Matched the digit pattern but is not a valid local time.
                publish_time = now

        try:
            video_url = response.js_script_result[0]
        except (AttributeError, IndexError, KeyError, TypeError):
            # No script result, or one that cannot be indexed at 0.
            video_url = ''

        # Whitespace-separated text of the '.b02 > a' links; entries 1 and 2
        # are used as root column and column, and only when both exist.
        breadcrumb = response.doc('.b02 > a').text().split()
        if len(breadcrumb) >= 3:
            root_column, column = breadcrumb[1], breadcrumb[2]
        else:
            root_column = column = ''

        return {
            "url": response.url,
            "project": self.project_name,
            "program_name": response.doc('.text_title').text(),
            "content": "",
            "actor": "",
            "spider_time": now,
            "poster": response.save['p'],
            "create_time": now,
            "publish_time": publish_time,
            "director": "",
            "author": author,
            "source": "云南网络广播电视台",
            "accountcode": "15_STWZ_YNTVCN_00_530000",
            "video_url": video_url,
            "root_column_name": root_column,
            "root_column_id": "",
            "column_id": "",
            "column_name": column,
            "program_id": "",
            "tags": "视听",
            "episode": 1
        }