
pyspider: Crawling and Downloading Images

Using the 1PPT website as an example: http://www.1ppt.com/

from pyspider.libs.base_handler import *
import urllib2, HTMLParser, re

# Root URL of the target site
host = "http://www.1ppt.com/"
# Local directory to save downloaded images (must already exist)
localSavePath = '/data/girls/'
# URL of the picture page currently being handled
startHtmlUrl = ''
# URLs of picture pages
htmlUrlList = []
# URLs of images found so far
imageUrlList = []
# Pattern used both to recognise image URLs and to derive local file names
patter = '[0-9]*\.jpg'

# Download the image at the given URL and save it locally
def downloadImage(url):
    print url
    cont = urllib2.urlopen(url).read()
    match = re.search(patter, url)
    if match:
        print 'Downloading file:', match.group()
        filename = localSavePath + match.group()
        f = open(filename, 'wb')
        f.write(cont)
        f.close()
    else:
        print 'no match'

# Fetch one picture page found on the index and feed it to the parser
def getImageUrlByHtmlUrl(htmlUrl):
    parser = MyHtmlParse(False)
    request = urllib2.Request(htmlUrl)
    try:
        response = urllib2.urlopen(request)
        content = response.read()
        parser.feed(content)
    except urllib2.URLError, e:
        print e.reason
        return

class MyHtmlParse(HTMLParser.HTMLParser):
    def __init__(self, isIndex):
        # isIndex is True when parsing the index page, False for a picture page
        self.isIndex = isIndex
        HTMLParser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        if self.isIndex:
            # On the index page, <a> tags with a title attribute point to picture pages
            if tag == 'a' and len(attrs) == 3 and attrs[1][0] == 'title':
                newUrl = host + attrs[0][1]
                # print 'Found a picture page link:', newUrl
                global startHtmlUrl
                startHtmlUrl = newUrl
                getImageUrlByHtmlUrl(newUrl)
        else:
            # On a picture page, <img src=... alt=...> tags carry the image URLs
            if tag == 'img':
                if attrs[0][0] == 'src' and attrs[1][0] == 'alt' and attrs[0][1]:
                    imageUrl = attrs[0][1]
                    match = re.search(patter, imageUrl)
                    if match:
                        print 'Found an image:', imageUrl
                        downloadImage(imageUrl)
                        imageUrlList.append(imageUrl)

# Parse the index page and follow every picture-set link found on it
def parse_url_picture(indexUrl):
    m = urllib2.urlopen(indexUrl).read()
    parserIndex = MyHtmlParse(True)
    parserIndex.feed(m)

picture_website = r'http://www.1ppt.com/'

class Handler(BaseHandler):
    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl(picture_website, callback=self.index_page)
        return

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        for each in response.doc('a[href^="http"]').items():
            print each.attr.href
            parse_url_picture(each.attr.href)
            self.crawl(each.attr.href, callback=self.detail_page)
        return

    @config(priority=2)
    def detail_page(self, response):
        return {
        }
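On the pyspider platform, whatever dict a detail_page callback returns is what gets stored as that task's result; the handler above returns an empty dict, so nothing is recorded in the result database. As a rough sketch, detail_page could instead return something like the following, replacing the method of the same name in the Handler class above. The url and title fields follow pyspider's standard quickstart handler; images_found and the reuse of the module-level imageUrlList are illustrative additions, not part of the original script.

    # Hypothetical variant of detail_page for the Handler class above:
    # record which page was crawled, its <title>, and how many image URLs
    # MyHtmlParse has collected so far.
    @config(priority=2)
    def detail_page(self, response):
        return {
            "url": response.url,                    # URL of the crawled page
            "title": response.doc('title').text(),  # page <title> via PyQuery
            "images_found": len(imageUrlList),      # running total of images
        }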

 

The script below runs directly as a standalone program (no need to deploy it on the pyspider crawler platform):

#!/usr/bin/python
# coding: utf-8
#########################################################################
# File Name: girls.py
# Author: mylonly
# mail: mylonly@gmail.com
# Created Time: Mon 09 Jun 2014 09:23:18 PM CST
#########################################################################
import urllib2, HTMLParser, re

# Root URL of the target site
host = "http://1ppt.com"
# Local directory to save downloaded images (must already exist)
localSavePath = '/data/girls/'
# URL of the picture page currently being handled
startHtmlUrl = ''
# URLs of picture pages
htmlUrlList = []
# URLs of images found so far
imageUrlList = []
# Pattern used both to recognise image URLs and to derive local file names
patter = '[0-9]*\.jpg'

# Download the image at the given URL and save it locally
def downloadImage(url):
    print url
    cont = urllib2.urlopen(url).read()
    match = re.search(patter, url)
    if match:
        print 'Downloading file:', match.group()
        filename = localSavePath + match.group()
        f = open(filename, 'wb')
        f.write(cont)
        f.close()
    else:
        print 'no match'

# Fetch one picture page found on the index and feed it to the parser
def getImageUrlByHtmlUrl(htmlUrl):
    parser = MyHtmlParse(False)
    request = urllib2.Request(htmlUrl)
    try:
        response = urllib2.urlopen(request)
        content = response.read()
        parser.feed(content)
    except urllib2.URLError, e:
        print e.reason

class MyHtmlParse(HTMLParser.HTMLParser):
    def __init__(self, isIndex):
        # isIndex is True when parsing the index page, False for a picture page
        self.isIndex = isIndex
        HTMLParser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        if self.isIndex:
            # On the index page, <a> tags with a title attribute point to picture pages
            if tag == 'a' and len(attrs) == 3 and attrs[1][0] == 'title':
                newUrl = host + attrs[0][1]
                # print 'Found a picture page link:', newUrl
                global startHtmlUrl
                startHtmlUrl = newUrl
                getImageUrlByHtmlUrl(newUrl)
        else:
            # On a picture page, <img src=... alt=...> tags carry the image URLs
            if tag == 'img':
                if attrs[0][0] == 'src' and attrs[1][0] == 'alt' and attrs[0][1]:
                    imageUrl = attrs[0][1]
                    match = re.search(patter, imageUrl)
                    if match:
                        print 'Found an image:', imageUrl
                        downloadImage(imageUrl)
                        imageUrlList.append(imageUrl)

# Parse the index page and follow every picture-set link found on it
indexUrl = 'http://www.1ppt.com'
m = urllib2.urlopen(indexUrl).read()
parserIndex = MyHtmlParse(True)
parserIndex.feed(m)
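Note that both scripts are Python 2 only: urllib2, the HTMLParser module name, and the print statements do not exist in Python 3. For readers on Python 3, here is a minimal sketch of just the download step using the standard library; the names mirror the originals, the pattern is the same, and it still assumes the /data/girls/ directory already exists.

# Minimal Python 3 sketch of downloadImage: fetch a URL and, if it looks
# like a numbered .jpg, save it under localSavePath using the matched name.
import re
import urllib.request

localSavePath = '/data/girls/'   # save directory, assumed to exist
patter = r'[0-9]*\.jpg'          # same pattern as in the scripts above

def downloadImage(url):
    cont = urllib.request.urlopen(url).read()
    match = re.search(patter, url)
    if match:
        print('Downloading file:', match.group())
        with open(localSavePath + match.group(), 'wb') as f:
            f.write(cont)
    else:
        print('no match')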

This article is reposted from: https://www.cnblogs.com/panliu/p/4849212.html

The original author is an interesting person; if this infringes any rights, please notify me and it will be removed.
