「抽屉网」抽屉网

抽屉网

==========================（一）=========================

# -*- coding: utf-8 -*-

import scrapy

class ChoutiSpider(scrapy.Spider):
    """Log in to chouti.com via a POST request, then fetch a login-only page.

    Request flow:
      1. Scrapy fetches ``start_urls`` and hands the response to ``parse``.
      2. ``parse`` submits the login form.
      3. ``parse_index`` receives the login result and requests the
         saved-links page.
      4. ``parse_saved`` prints the saved-links page.
    """

    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/']

    def parse(self, response):
        """Simulate a login by sending a POST request.

        ``response`` is the response to the first (start URL) request. Some
        sites embed parameters needed for login (e.g. csrf_token, xsrf) in
        the login page; those would have to be extracted from ``response``
        before sending the POST. No such parameter is extracted here.
        """
        # FormRequest is the Scrapy request class used to send POST forms.
        # NOTE(review): credentials are hard-coded in source; consider moving
        # them to spider arguments or settings.
        yield scrapy.FormRequest(
            # Login endpoint.
            url='http://dig.chouti.com/login',
            # Form fields sent in the POST body.
            formdata={
                'phone': '8615237034401',
                'password': '13243259989',
                'oneMonth': '1',
            },
            # Called with the login response.
            callback=self.parse_index,
        )

    def parse_index(self, response):
        """Print the login result and request a page that requires login.

        Example login response body::

            {"result":{"code":"9999", "message":"", "data":{"complateReg":"0","destJid":"cdu_52091364220"}}}
        """
        print(response.text)
        print('...')
        # Use a dedicated callback: routing this request back to parse_index
        # would re-issue the same request on every response and only stop
        # because the duplicate filter drops it.
        yield scrapy.Request(
            url='http://dig.chouti.com/user/link/saved/1',
            callback=self.parse_saved,
        )

    def parse_saved(self, response):
        """Print the body of the saved-links page."""
        print(response.text)

=======================（二）==================================

# -*- coding: utf-8 -*-

import scrapy

import codecs

import requests

class Chouti2Spider(scrapy.Spider):
    """Log in to chouti.com and forward the login cookies by hand.

    Request flow:
      1. ``start_requests`` sends the login POST directly (no start URL).
      2. ``parse`` reads the ``Set-Cookie`` headers of the login response and
         requests the saved-links page with an explicit ``Cookie`` header.
      3. ``parse_center`` prints the page and writes it to ``1.html``.
    """

    name = 'chouti2'
    allowed_domains = ['chouti.com']
    # start_urls does not need to be set because start_requests() is
    # overridden below.

    def start_requests(self):
        """Yield the first request of the crawl: the login POST.

        The default implementation loops over ``start_urls`` and yields one
        request per URL; this override sends the login form instead.
        """
        # NOTE(review): credentials are hard-coded in source; consider moving
        # them to spider arguments or settings.
        yield scrapy.FormRequest(
            url='http://dig.chouti.com/login',
            formdata={
                'phone': '8615237034401',
                'password': '13243259989',
                'oneMonth': '1',
            },
            # No callback is given, so the response is passed to parse().
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0',
            },
        )

    def parse(self, response):
        """Extract the login cookies and request a page that requires login."""
        # getlist() returns every Set-Cookie value as bytes, or an empty list
        # when the header is absent.
        raw_cookies = response.headers.getlist('Set-Cookie')
        if not raw_cookies:
            self.logger.warning(
                'No Set-Cookie header in login response from %s; '
                'skipping the authenticated request.',
                response.url,
            )
            return

        # Keep only the "name=value" part of each cookie, dropping attributes
        # such as Path or Expires.
        cookies = [ck.decode('utf-8').split(';')[0] for ck in raw_cookies]
        # Only the first two cookies are forwarded; the original author noted
        # that the "gpsd" cookie causes problems (presumably why the rest are
        # dropped -- TODO confirm).
        cookie_str = ';'.join(cookies[:2])

        # Directly request a page that is only accessible after login.
        yield scrapy.Request(
            url='http://dig.chouti.com/user/link/saved/1',
            callback=self.parse_center,
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0',
                'Host': 'dig.chouti.com',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'referer': 'http://dig.chouti.com/',
                'Connection': 'keep-alive',
                'Cookie': cookie_str,
                'Upgrade-Insecure-Requests': '1',
            },
        )

    def parse_center(self, response):
        """Print the saved-links page and write it to ``1.html`` as UTF-8."""
        print(response.text)
        # newline='' writes the text without newline translation, matching
        # what codecs.open() did.
        with open('1.html', 'w', encoding='utf-8', newline='') as f:
            f.write(response.text)

'd217b0e9ece60ada35f57015f54e8f59'

抽屉网

抽屉网

相关阅读

栏目导航

推荐阅读

热门阅读