# Scrapy ItemLoader examples
def parse_first_page(self, response):
    """Schedule every page of an album and emit an item for this page.

    The page count is the first integer found in the text of
    ``//ul[@class="image"]``; the album title is read from the cookies of
    the request that produced ``response``.

    Yields:
        One ``scrapy.Request`` per page (callback ``self.parse_item``, the
        title forwarded as a cookie), then the ``PageItem`` loaded from
        ``response`` itself.
    """
    # NOTE(review): the pasted source had lost its indentation. The nesting
    # used here (a request for every page, one item built after the loop) is
    # a reconstruction -- confirm against the original file.
    counter_node = response.xpath('//ul[@class="image"]/text()')[0]
    page_total = int(counter_node.re(r'.*?(\d+).*?')[0])
    album_title = response.request.cookies['title']
    base_url = response.url.replace(".shtml", '')

    for page_no in range(1, page_total + 1):
        # Page 1 keeps the bare ".shtml" name; later pages are "_<n>.shtml".
        tail = ("_" + str(page_no) + ".shtml") if page_no > 1 else ".shtml"
        yield scrapy.Request(
            base_url + tail,
            callback=self.parse_item,
            cookies={'title': album_title},
        )

    page_loader = ItemLoader(item=PageItem(), response=response)
    page_loader.add_value('title', album_title)
    page_loader.add_value('name', self.name)
    page_loader.add_value('url', response.url)
    page_loader.add_xpath('image_urls', '//td[@valign="top"]/img/@src')
    yield page_loader.load_item()
def parse_item(self, response):
    """Load an ``EolZhiyeItem`` from a detail page.

    Populates ``url`` and ``code`` from the response URL, ``name`` from the
    ``h1#pagetitle`` text, and ``category``/``category2``/``detail`` from
    nodes around ``div#precontent``.

    Yields:
        The loaded ``EolZhiyeItem``.
    """
    # ItemLoader's second positional parameter is ``selector``, not
    # ``response``; pass the response by keyword so the loader builds its
    # own selector from it.
    loader = ItemLoader(item=EolZhiyeItem(), response=response)
    loader.add_value('url', response.url)
    # The code is the run of word characters between a "/" and ".shtml".
    loader.add_value('code', response.url, re=r'/(\w+)\.shtml')
    loader.add_css('name', 'h1#pagetitle::text')
    # NOTE(review): the "??" inside contains(...) looks like non-ASCII text
    # lost to an encoding error, which would explain why 'category' and
    # 'category2' currently share one expression. The literals are left
    # untouched -- restore them from the original file.
    loader.add_xpath('category', u'//div[@id="precontent"]/p[contains(., "??")]/a/text()')
    loader.add_xpath('category2', u'//div[@id="precontent"]/p[contains(., "??")]/a/text()')
    # Every sibling node after div#precontent except tables, joined by
    # newlines.
    loader.add_xpath(
        'detail',
        u'//div[@id="precontent"]/following-sibling::node()[not(self::table)]',
        Join('\n'),
    )
    yield loader.load_item()
def parse_question(self, response):
    """Extract a ``ZhihuQuestionItem`` from a question page.

    The numeric question id is taken from the response URL. Two page
    layouts are handled: one whose HTML contains ``QuestionHeader-title``
    and a fallback using the ``zh-question-*`` selectors.

    Yields:
        A ``scrapy.Request`` for ``self.start_answer_url`` formatted with the
        question id (callback ``self.parse_answer``), then the loaded
        question item. Nothing is yielded when the URL carries no
        question id.
    """
    import logging

    # Resolve the id once, up front. Previously it was only bound when the
    # pattern matched but was used unconditionally afterwards, so a
    # non-question URL ended in an UnboundLocalError.
    match_obj = re.match(r"(.*zhihu.com/question/(\d+))(/|$).*", response.url)
    if match_obj is None:
        logging.getLogger(__name__).warning(
            "Skipping %s: no question id found in URL", response.url)
        return
    question_id = int(match_obj.group(2))

    item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
    if "QuestionHeader-title" in response.text:
        # Layout identified by the QuestionHeader-title marker.
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_css("content", ".QuestionHeader-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css("comments_num", ".QuestionHeader-Comment button::text")
        item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
        item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
    else:
        # Fallback layout (presumably the older page version -- confirm).
        # The title may sit in either an <a> or a <span> under the heading.
        item_loader.add_xpath(
            "title",
            "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()")
        item_loader.add_css("content", "#zh-question-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", "#zh-question-answer-num::text")
        item_loader.add_css("comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
        item_loader.add_xpath(
            "watch_user_num",
            "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()")
        item_loader.add_css("topics", ".zm-tag-editor-labels a::text")
    question_item = item_loader.load_item()

    # The trailing 20 and 0 are presumably page size and offset of the
    # answers endpoint -- confirm against start_answer_url.
    yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0), headers=self.headers,
                         callback=self.parse_answer)
    yield question_item
def parse_item(self, response):
    """Build a ``HeatMapItem`` from a JSON response and mark its URL done.

    The response body is decoded as JSON. Its ``data`` and ``nt`` entries
    are removed from the decoded dict and stored under ``data`` and
    ``timestamp``; whatever remains is stored under ``others``. The serial
    looked up in ``self.all_urls`` is added to ``self.finished`` and
    ``self.claim_completeness()`` is called before the item is yielded.

    Yields:
        The loaded ``HeatMapItem``.
    """
    page_url = response.url
    serial = self.all_urls[page_url]
    self.logger.info("Trying page %s %s" % (serial, page_url))

    payload = json.loads(response.body)
    loader = ItemLoader(item=HeatMapItem(), response=response)
    hour_stamp = time.strftime("%Y%m%d%H", time.localtime())

    # Built in order: the two pops run before ``payload`` is stored as
    # 'others', so 'others' holds only the leftover keys.
    field_values = (
        ('cur_hour', hour_stamp),
        ('serial', serial),
        ('data', payload.pop('data')),
        ('timestamp', payload.pop('nt')),
        ('others', payload),
        ('url', page_url),
        ('is_parsed', 0),
    )
    for field_name, field_value in field_values:
        loader.add_value(field_name, field_value)

    self.finished.add(serial)
    self.logger.info(u"Crawling %s, %s successfully. :)" % (serial, page_url))
    self.claim_completeness()
    yield loader.load_item()
# else:
# if resp_dct.get("data") == "\\u8be5\\u7528\\u6237\\u8bbf\\u95ee\\u6b21\\u6570\\u8fc7\\u591a".decode(
# 'unicode_escape'): # ??????
# banned_cookie = response.request.cookies
# self.logger.warning("%s has been BANNED today." % banned_cookie)
# self.cookies.remove(banned_cookie)
# yield {"BannedCookieToday": banned_cookie}
# else:
# yield {}
# self.logger.error(u"Crawling %s, %s failed. :(" % (item_idx, response.url))
def parse(self, response):
    """Emit one ``LianjiaErshouItem`` per listing in a JSON response.

    The response body is decoded as JSON and the entries under
    ``data -> list`` are walked; a fixed set of keys is copied from each
    entry into a fresh item.

    Yields:
        One loaded ``LianjiaErshouItem`` per listing.

    Raises:
        KeyError: if the payload lacks ``data``/``list`` or a listing lacks
            one of the copied keys.
    """
    payload = json.loads(response.body_as_unicode())
    # Keys copied verbatim from each listing; item field names match them.
    field_names = (
        'house_code',
        'price_total',
        'ctime',
        'title',
        'frame_hall_num',
        'tags',
        'house_area',
        'community_id',
        'community_name',
        'is_two_five',
        'frame_bedroom_num',
    )
    for listing in payload['data']['list']:
        loader = ItemLoader(item=LianjiaErshouItem(), response=response)
        for field_name in field_names:
            loader.add_value(field_name, listing[field_name])
        yield loader.load_item()
def parse_item(self, response):
    """Load a ``GaokaopaiZhuanyeItem`` and request its "jiuye" page.

    Besides scalar fields, two nested structures are collected: ``related``
    (a list of url/code/name dicts built from links in the course section)
    and ``category`` (one url/code/name dict per heading looked up).

    Returns:
        A ``Request`` for the ``zhuanye-jiuye-<code>.html`` page, carrying
        the loaded item in ``meta['item']``, with callback
        ``self.parse_jiuye``.
    """
    # ItemLoader's second positional parameter is ``selector``, not
    # ``response``; pass the response by keyword.
    loader = ItemLoader(item=GaokaopaiZhuanyeItem(), response=response)
    loader.add_value('url', response.url)
    loader.add_css('name', u'.majorTitle>h1::text')
    # NOTE(review): the runs of "?" in the literals below look like
    # non-ASCII text lost to an encoding error (a pattern that starts with
    # "?" is not a valid regular expression). They are left untouched --
    # restore them from the original file.
    loader.add_xpath('code', u'//div[@class="majorBase"]/h3[starts-with(., "?????")]/text()', re=u'?(.+)')
    loader.add_xpath('degree', u'//div[@class="majorBase"]/h3[starts-with(., "?????")]/text()', re=u'?(.+)')
    loader.add_xpath('period', u'//div[@class="majorBase"]/h3[starts-with(., "?????")]/text()', re=u'?(.+)')
    loader.add_xpath('courses', u'//div[@class="course"]/h3[.="?????"]/following-sibling::p/text()')

    def parse_related():
        # One dict per link that follows the matching course heading.
        for e in response.xpath(u'//div[@class="course"]/h3[.="?????"]/following-sibling::a'):
            href = e.css('::attr(href)')
            yield {
                'url': href.extract_first(),
                'code': href.re_first(u'-([^-]+)\\.html'),
                'name': e.css('::text').extract_first(),
            }

    loader.add_value('related', list(parse_related()))

    def parse_category():
        # One dict per heading: the currently selected entry of the first
        # list after that heading.
        category = []
        for i in [u"????", u"????", u"????"]:
            x = u'//h3[.="{}"]/following-sibling::ul[1]/li[@class="current"]/a'.format(i)
            e = response.xpath(x)
            href = e.css('::attr(href)')
            code = href.re_first(u'/zhuanye([-0-9]*)\\.html')
            category.append({
                'url': href.extract_first(),
                # re_first returns None when nothing matches; only strip a
                # real string.
                'code': code.strip('-') if code is not None else None,
                'name': e.css('::text').extract_first(),
            })
        return category

    loader.add_value('category', parse_category())
    loader.add_css('detail', u'.majorCon')
    item = loader.load_item()
    # Relies on 'code' having been extracted; indexing fails otherwise.
    return Request(
        url='http://www.gaokaopai.com/zhuanye-jiuye-{}.html'.format(item['code'][0]),
        meta={'item': item},
        callback=self.parse_jiuye
    )
def parse_first_page(self, response):
    """Schedule every page of an album and emit an item for this page.

    The page count is the first integer found in the text of the first
    link under ``div#aplist``; the album title is read from the cookies of
    the request that produced ``response``.

    Yields:
        One ``scrapy.Request`` per page (callback ``self.parse_item``, the
        title forwarded as a cookie), then the ``PageItem`` loaded from
        ``response`` itself.
    """
    # NOTE(review): the pasted source had lost its indentation. The nesting
    # used here (a request for every page, one item built after the loop) is
    # a reconstruction -- confirm against the original file.
    counter_node = response.xpath('//div[@id="aplist"]/ul/li[1]/a/text()')[0]
    page_total = int(counter_node.re(r'.*?(\d+).*?')[0])
    album_title = response.request.cookies['title']
    base_url = response.url.replace(".html", '')

    for page_no in range(1, page_total + 1):
        # Page 1 keeps the bare ".html" name; later pages are "_<n>.html".
        tail = ("_" + str(page_no) + ".html") if page_no > 1 else ".html"
        yield scrapy.Request(
            base_url + tail,
            callback=self.parse_item,
            cookies={'title': album_title},
        )

    page_loader = ItemLoader(item=PageItem(), response=response)
    page_loader.add_value('title', album_title)
    page_loader.add_value('name', self.name)
    page_loader.add_value('url', response.url)
    page_loader.add_xpath('image_urls', '//p[@id="contents"]/a/img/@src')
    yield page_loader.load_item()
def parse(self, response):
    """Load a ``MovieItem`` from a movie detail page.

    Extraction is best-effort: if any step raises (for example because the
    title or year node is absent and ``extract_first()`` returned None), the
    page is skipped.

    Returns:
        The loaded ``MovieItem``, or None when extraction failed.
    """
    import logging

    try:
        loader = ItemLoader(item=MovieItem(), response=response)
        loader.add_value(
            'name',
            response.css('div#content h1 [property="v:itemreviewed"]::text').extract_first().strip())

        year = response.css('div#content h1 span.year::text').extract_first()
        if year.startswith('('):
            # Drop the first and last character (the surrounding brackets).
            year = year[1:-1]
        loader.add_value('year', year)

        # The first two non-empty bare text nodes of div#info are taken as
        # region and language, each a "/"-separated list.
        info_parts = []
        for raw in response.css('div#info::text').extract():
            cleaned = raw.strip().strip('/')
            if cleaned:
                info_parts.append(cleaned)
                if len(info_parts) == 2:
                    break
        if len(info_parts) == 2:
            loader.add_value('region', info_parts[0].split('/'))
            loader.add_value('language', info_parts[1].split('/'))

        loader.add_value('duration', response.css('div#info [property="v:runtime"]::attr(content)').extract_first())
        loader.add_value('types', response.css('div#info [property="v:genre"]::text').extract())
        loader.add_value('directors', response.css('div#info [rel="v:directedBy"]::text').extract())
        loader.add_value('actors', response.css('div#info [rel="v:starring"]::text').extract())
        loader.add_value('runtime', response.css('div#info [property="v:initialReleaseDate"]::text').extract())
        loader.add_value('detailurl', response.url)
        loader.add_value('IMDburl', response.css('div#info [rel="nofollow"]::attr(href)').extract())
        loader.add_value('stars', response.css('strong[property="v:average"]::text').extract_first())
        return loader.load_item()
    except Exception:
        # Still skip the page, but record why instead of failing silently.
        logging.getLogger(__name__).exception(
            "Failed to extract movie item from %s", getattr(response, 'url', response))
        return None
def parse_song_list(self, response):
    """Schedule one song request per link in the hidden song list.

    Each ``<a>`` under ``ul.f-hide`` yields a request to
    ``self.BASE_URL + href``. A partially filled ``PlayListItem`` loader
    (song name and page title) travels in ``meta['loader']``.

    Yields:
        ``scrapy.FormRequest`` objects (GET) with callback
        ``self.parse_single_song``.
    """
    selector = Selector(response)
    title = selector.xpath('//title/text()').extract()

    # Read name and href from the same anchor. Two separate page-wide
    # queries paired by index drift apart (or raise IndexError) as soon as
    # one anchor lacks a text node.
    for anchor in selector.xpath('//body//ul[@class="f-hide"]/li/a'):
        hrefs = anchor.xpath('@href').extract()
        if not hrefs:
            # No target to request for this entry.
            continue
        song_href = hrefs[0]
        names = anchor.xpath('text()').extract()

        loader = ItemLoader(item=PlayListItem())
        if names:
            loader.add_value('song_name', names[0])
        loader.add_value('title', title)
        # song_href[9:] presumably strips a 9-character "/song?id=" prefix,
        # leaving the bare id -- confirm against the page markup.
        yield scrapy.FormRequest(
            url=self.BASE_URL + song_href,
            meta={'song_id': song_href[9:], 'loader': loader},
            method='GET',
            headers=self.headers,
            callback=self.parse_single_song,
        )