以第一PPT网站为例:http://www.1ppt.com/ 。下面第一个脚本需要放到 pyspider 爬虫平台上运行:
from pyspider.libs.base_handler import *import urllib2,HTMLParser,reimport urllib2,HTMLParser,re#根urlhost = "http://www.1ppt.com/"#本地保存地址localSavePath = '/data/girls/'#起始图片html地址startHtmlUrl = ''#图片页Html的地址htmlUrlList = []#图片Url地址imageUrlList = []patter = '[0-9]*\.jpg';#根据得到的图片路径URL将图片下载下来保存本地def downloadImage(url): print url cont = urllib2.urlopen(url).read() match = re.search(patter,url); if match: print '正在下载文件:',match.group() filename = localSavePath+match.group() f = open(filename,'w+') f.write(cont) f.close() else: print 'no match'#根据首页得到的图片集遍历每个图片集def getImageUrlByHtmlUrl(htmlUrl): parser = MyHtmlParse(False) request = urllib2.Request(htmlUrl) try: response = urllib2.urlopen(request) content = response.read() parser.feed(content) except urllib2.URLError,e: print e.reason returnclass MyHtmlParse(HTMLParser.HTMLParser): def __init__(self,isIndex): self.isIndex = isIndex; HTMLParser.HTMLParser.__init__(self) def handle_starttag(self,tag,attrs): #print tag #print attrs if(self.isIndex): if(tag == 'a'): if(len(attrs) == 3): #print attrs[0] if(attrs[1][0] =='title'): newUrl = host + attrs[0][1] # print '找到一处图片的网页链接:',newUrl global startHtml startHtmlUrl = newUrl getImageUrlByHtmlUrl(newUrl) else: #print tag if(tag == 'img'): # print attrs #print attrs[0][0] #print attrs[1][0] if(attrs[0][0] == 'src' and attrs[1][0] == 'alt' and attrs[0][1] ): imageUrl = attrs[0][1] match = re.search(patter,imageUrl) if match: print '找到一张图片:',imageUrl downloadImage(imageUrl) imageUrlList.append(imageUrl) #if (tag == 'a'): #if (len(attrs) == 4): ##if (attrs[1] == ('class','next')): #nextUrl = host + attrs[2][1] #print '找到一处图片的网页链接:',nextUrl #global startHtmlUrl #if (startHtmlUrl != nextUrl): #getImageUrlByHtmlUrl(nextUrl)#分析首页得到每个图片集的链接def parse_url_picture(indexUrl): #indexUrl = 'http://desk.zol.com.cn/meinv/' #分析首页得到每个图片集的链接 #indexUrl = 'http://www.1ppt.com' m = urllib2.urlopen(indexUrl).read() #print m parserIndex = MyHtmlParse(True) parserIndex.feed(m)picture_website = 
r'http://www.1ppt.com/'class Handler(BaseHandler): crawl_config = { } @every(minutes=24 * 60) def on_start(self): self.crawl(picture_website, callback=self.index_page) return @config(age= 10 * 24 * 60 * 60) def index_page(self, response): for each in response.doc('a[href^="http"]').items(): print each.attr.href parse_url_picture(each.attr.href) self.crawl(each.attr.href, callback=self.detail_page) return @config(priority=2) def detail_page(self, response): return{ }
下面脚本是直接运行(不用放到爬虫平台上):
#coding: utf-8 ############################################################## File Name: girls.py# Author: mylonly# mail: mylonly@gmail.com# Created Time: Mon 09 Jun 2014 09:23:18 PM CST##########################################################################!/usr/bin/pythonimport urllib2,HTMLParser,re#根urlhost = "http://1ppt.com"#本地保存地址localSavePath = '/data/girls/'#起始图片html地址startHtmlUrl = ''#图片页Html的地址htmlUrlList = []#图片Url地址imageUrlList = []patter = '[0-9]*\.jpg';#根据得到的图片路径URL将图片下载下来保存本地def downloadImage(url): print url cont = urllib2.urlopen(url).read() match = re.search(patter,url); if match: print '正在下载文件:',match.group() filename = localSavePath+match.group() f = open(filename,'w+') f.write(cont) f.close() else: print 'no match'#根据首页得到的图片集遍历每个图片集def getImageUrlByHtmlUrl(htmlUrl): parser = MyHtmlParse(False) request = urllib2.Request(htmlUrl) try: response = urllib2.urlopen(request) content = response.read() parser.feed(content) except urllib2.URLError,e: print e.reasonclass MyHtmlParse(HTMLParser.HTMLParser): def __init__(self,isIndex): self.isIndex = isIndex; HTMLParser.HTMLParser.__init__(self) def handle_starttag(self,tag,attrs): #print tag #print attrs if(self.isIndex): if(tag == 'a'): if(len(attrs) == 3): #print attrs[0] if(attrs[1][0] =='title'): newUrl = host + attrs[0][1] # print '找到一处图片的网页链接:',newUrl global startHtml startHtmlUrl = newUrl getImageUrlByHtmlUrl(newUrl) else: #print tag if(tag == 'img'): # print attrs print attrs[0][0] print attrs[1][0] if(attrs[0][0] == 'src' and attrs[1][0] == 'alt' and attrs[0][1] ): imageUrl = attrs[0][1] match = re.search(patter,imageUrl) if match: print '找到一张图片:',imageUrl downloadImage(imageUrl) imageUrlList.append(imageUrl) #if (tag == 'a'): #if (len(attrs) == 4): ##if (attrs[1] == ('class','next')): #nextUrl = host + attrs[2][1] #print '找到一处图片的网页链接:',nextUrl #global startHtmlUrl #if (startHtmlUrl != nextUrl): #getImageUrlByHtmlUrl(nextUrl)#分析首页得到每个图片集的链接indexUrl = 'http://www.1ppt.com'm = 
urllib2.urlopen(indexUrl).read()#print mparserIndex = MyHtmlParse(True)parserIndex.feed(m)
还没有人抢沙发呢~