抽屉网
==========================(一)=========================
# -*- coding: utf-8 -*-
import scrapy
class ChoutiSpider(scrapy.Spider):
    """Log in to dig.chouti.com with a POST request, then fetch a login-only page.

    Request flow:

    1. ``start_urls`` is fetched and its response is handed to ``parse``.
    2. ``parse`` submits the login form.
    3. ``parse_index`` receives the login result and requests the
       saved-links page.
    4. ``parse_saved`` prints that page.
    """

    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/']

    # Login form values. Kept as class attributes so they can be overridden
    # per run with spider arguments instead of editing the source, e.g.
    #   scrapy crawl chouti -a phone=... -a password=...
    phone = '8615237034401'
    password = '13243259989'

    def parse(self, response):
        """Simulate a login by sending a POST request.

        Called with the response to the first request (``start_urls``).
        Some sites embed parameters that the login needs (for example
        ``csrf_token`` or ``xsrf``) in the page they return; those would be
        extracted from ``response`` here before sending the POST. This
        spider sends only the fixed form fields below.

        Args:
            response: response to the ``start_urls`` request (not read).

        Yields:
            A ``scrapy.FormRequest`` whose result goes to ``parse_index``.
        """
        # FormRequest is the Scrapy class for sending form-encoded POSTs.
        yield scrapy.FormRequest(
            # Login endpoint.
            url='http://dig.chouti.com/login',
            # Form fields.
            formdata={
                'phone': self.phone,
                'password': self.password,
                'oneMonth': '1',
            },
            callback=self.parse_index,
        )

    def parse_index(self, response):
        """Print the login result and request a page that requires login.

        Example of the login response body::

            {"result":{"code":"9999", "message":"", "data":{"complateReg":"0","destJid":"cdu_52091364220"}}}

        Args:
            response: response to the login POST.

        Yields:
            A ``scrapy.Request`` for the saved-links page, handled by
            ``parse_saved``.
        """
        print(response.text)
        print('...')
        yield scrapy.Request(
            url='http://dig.chouti.com/user/link/saved/1',
            # A dedicated callback: routing this response back into
            # parse_index would re-yield the very same URL and rely on the
            # duplicate filter to stop the cycle.
            callback=self.parse_saved,
        )

    def parse_saved(self, response):
        """Print the body of the saved-links page.

        Args:
            response: response to the saved-links request.
        """
        print(response.text)
=======================(二)==================================
# -*- coding: utf-8 -*-
import scrapy
import codecs
import requests
class Chouti2Spider(scrapy.Spider):
    """Log in from ``start_requests`` and replay the returned cookies by hand.

    Request flow:

    1. ``start_requests`` sends the login POST (no ``start_urls`` needed).
    2. ``parse`` reads the ``Set-Cookie`` headers of the login response and
       requests a login-only page with an explicit ``Cookie`` header.
    3. ``parse_center`` prints that page and saves it to ``1.html``.
    """

    name = 'chouti2'
    allowed_domains = ['chouti.com']
    # start_urls is not set: start_requests() below issues the first request.

    # Login form values. Kept as class attributes so they can be overridden
    # per run with spider arguments instead of editing the source, e.g.
    #   scrapy crawl chouti2 -a phone=... -a password=...
    phone = '8615237034401'
    password = '13243259989'

    def start_requests(self):
        """Send the login POST as the spider's first request.

        This is the first method called when the spider starts; here it
        replaces the usual iteration over ``start_urls``.

        Yields:
            A ``scrapy.FormRequest`` for the login endpoint. No callback is
            given, so the response is delivered to ``parse``.
        """
        yield scrapy.FormRequest(
            url='http://dig.chouti.com/login',
            formdata={
                'phone': self.phone,
                'password': self.password,
                'oneMonth': '1',
            },
            headers={
                'User-Agent': 'Mozilla/5.0 (windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0',
            },
        )

    def parse(self, response):
        """Build a ``Cookie`` header from the login response and fetch a login-only page.

        Args:
            response: response to the login POST.

        Yields:
            A ``scrapy.Request`` for the saved-links page, handled by
            ``parse_center``.
        """
        # getlist() returns every Set-Cookie value (as bytes), or an empty
        # list when the header is absent.
        raw_cookies = response.headers.getlist('Set-Cookie')
        # Keep only the leading "name=value" part of each cookie, dropping
        # attributes such as Path or Expires.
        pairs = [raw.decode('utf-8').split(';')[0] for raw in raw_cookies]
        # Only the first two cookies are sent.
        # NOTE (original author): the "gpsd" cookie is problematic --
        # presumably that is why the list is truncated; confirm.
        cookie_str = ';'.join(pairs[:2])

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0',
            'Host': 'dig.chouti.com',
            'Accept': 'text/html,APPlication/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'referer': 'http://dig.chouti.com/',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }
        if cookie_str:
            # NOTE(review): if Scrapy's cookies middleware is enabled it also
            # tracks these cookies and may replace a hand-set Cookie header --
            # confirm the COOKIES_ENABLED setting for this project.
            headers['Cookie'] = cookie_str
        else:
            self.logger.warning(
                'Login response carried no Set-Cookie header; '
                'requesting the page without cookies'
            )

        # Request a page that is only available after login.
        yield scrapy.Request(
            url='http://dig.chouti.com/user/link/saved/1',
            callback=self.parse_center,
            headers=headers,
        )

    def parse_center(self, response):
        """Print the fetched page and save it as ``1.html`` (UTF-8).

        Args:
            response: response to the saved-links request.
        """
        print(response.text)
        with codecs.open('1.html', 'w+', encoding='utf-8') as f:
            f.write(response.text)
# Bare string expressions (no effect at runtime): the same 32-character hex
# value repeated three times.
# NOTE(review): looks like a pasted debugging value, possibly a cookie or
# token compared across runs -- confirm, or delete.
'd217b0e9ece60ada35f57015f54e8f59'
'd217b0e9ece60ada35f57015f54e8f59'
'd217b0e9ece60ada35f57015f54e8f59'
相关阅读
爬取抽屉新热榜 爬取段子 抽屉网址:https://dig.chouti.com/r/scoff/hot/ 爬虫的常规操作,根据需求进行分析。我们要爬取段子,也就……
设想你需要设计一个含有许多页面和模块,不能在一屏内显示完全的应用。你一定会首先想到去设计一个底部或顶部的Tab导航。等一下,多……