
pyspider — scraping video links

The pyspider script below starts from the zhanqi.tv game-category page, follows each category into its list of hot live rooms, and then extracts the flash player container (which holds the video link) plus basic room metadata from every room page:

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2015-03-20 09:46:20
# Project: fly_spider

import re
import time
#from pyspider.database.mysql.mysqldb import SQL
from pyspider.libs.base_handler import *
from pyquery import PyQuery as pq


class Handler(BaseHandler):

    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept-Language": "zh-CN,zh;q=0.8",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36"
    }

    crawl_config = {
        "headers": headers,
        "timeout": 100
    }

    @every(minutes=1)
    def on_start(self):
        # Entry point: re-crawl the game category index every minute.
        self.crawl('http://www.zhanqi.tv/games', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        print(response)
        # Follow every game-category link.
        for each in response.doc('a[href^="http://www.zhanqi.tv/games/"]').items():
            if re.match(r"http://www.zhanqi.tv/games/\w+", each.attr.href, re.U):
                self.crawl(each.attr.href,
                           fetch_type='js',
                           js_script="""
                           function() {
                               // scroll to the bottom after 5 s so lazily loaded rooms render
                               setTimeout(function() {
                                   window.scrollTo(0, document.body.scrollHeight);
                               }, 5000);
                           }
                           """,
                           callback=self.list_page)

    @config(age=1 * 60 * 60, priority=2)
    def list_page(self, response):
        # Follow every live-room link in the hot list of the active tab.
        for each in response.doc('.active > div.live-list-tabc > ul#hotList.clearfix > li > a').items():
            if re.match(r"http://www.zhanqi.tv/\w+", each.attr.href, re.U):
                self.crawl(each.attr.href,
                           fetch_type='js',
                           js_script="""
                           function() {
                               setTimeout(function() {
                                   window.scrollTo(0, document.body.scrollHeight);
                               }, 5000);
                           }
                           """,
                           callback=self.detail_page)

    @config(age=1 * 60 * 60, priority=2)
    def detail_page(self, response):
        # The player container holds the video link.
        flash_cont = None
        for each in response.doc('.video-flash-cont').items():
            flash_cont = pq(each).html()
            print(flash_cont)
        return {
            "url": response.url,
            "author": response.doc('.meat > span').text(),
            "title": response.doc('.title-name').text(),
            "game-name": response.doc('span > .game-name').text(),
            "users2": response.doc('div.live-anchor-info.clearfix > div.sub-anchor-info > div.clearfix > div.meat-info > span.num.dv.js-onlines-panel > span.dv.js-onlines-txt > span').text(),
            "flash-cont": flash_cont,
            # note: .text() of an <img> is empty; .attr('src') would give the image URL
            "picture": response.doc('.active > img').text(),
        }
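The script is normally driven through pyspider's WebUI (started with "pyspider all", which serves the dashboard on http://localhost:5000 by default) rather than run directly. Independently of pyspider, the CSS selectors that detail_page depends on can be exercised offline with pyquery. Below is a minimal sketch against an assumed, simplified HTML fragment: the class names mirror the script's selectors, but the markup itself is illustrative and not the real zhanqi.tv page.

from pyquery import PyQuery as pq

# Assumed, simplified stand-in for a live-room page; real zhanqi.tv markup differs.
snippet = """
<div>
  <div class="title-name">Example stream title</div>
  <div class="meat"><span>example_author</span></div>
  <span><span class="game-name">Example Game</span></span>
  <div class="video-flash-cont"><embed src="http://example.com/player.swf"></div>
</div>
"""

doc = pq(snippet)
print(doc('.title-name').text())        # Example stream title
print(doc('.meat > span').text())       # example_author
print(doc('span > .game-name').text())  # Example Game
print(doc('.video-flash-cont').html())  # the embedded player markup, i.e. the video link

If these expressions print the expected values against a saved copy of a real room page, the same selectors should behave identically inside detail_page, where response.doc is also a PyQuery object.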

Article reposted from: https://www.cnblogs.com/panliu/p/4849217.html

The original author is an interesting person; if anything here infringes, please notify us and it will be removed.
