python - Scrapy not working as expected -
This is my Scrapy spider. I'm trying to scrape data from the web, but I
don't know how to force Scrapy to follow links recursively.
Where is my mistake?
import re

from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
# NOTE(review): the capitalization of the project item class is reconstructed
# from Scrapy's default project template -- confirm against items.py.
from yellowpagesofmoldova.items import YellowpagesofmoldovaItem
from scrapy.item import Item


class YellowSpider(CrawlSpider):
    """Spider for yellowpages.md that extracts company details from a page."""

    name = 'yellow'
    allowed_domains = ['yellowpages.md']
    start_urls = [
        'http://www.yellowpages.md/eng/companies/info/8939-arc-publishing-house']

    # Follow every in-domain link whose URL matches 'eng.+'.
    rules = (
        Rule(SgmlLinkExtractor(allow=('eng.+')), follow=True),
    )

    # NOTE: this method is named `parse`, which CrawlSpider itself uses to
    # apply `rules`. Overriding it is why the links are not followed; this is
    # the problem the question asks about, so it is left as posted.
    def parse(self, response):
        """Build one item from the company page in ``response``.

        Each field other than ``url`` holds the list of strings returned by
        the corresponding XPath query.
        """
        sel = Selector(response)
        i = YellowpagesofmoldovaItem()
        i['url'] = response.url
        i['locality'] = sel.xpath("//tr[3]/td/p[1]/span[1]/text()").extract()
        i['title'] = sel.xpath('//title/text()').extract()
        i['title2'] = sel.xpath("//td/h1/text()").extract()
        i['website'] = sel.xpath("//p[2]/a/text()").extract()
        i['activity'] = sel.xpath("//tbody/tr[4]/td/p/text()").extract()
        i['street'] = sel.xpath("//tr/td/p[1]/span[2]/text()").extract()
        return i
thanks.
I resolved the trouble. It works perfectly now. It looks like this:
import re

from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
# NOTE(review): the capitalization of the project item class is reconstructed
# from Scrapy's default project template -- confirm against items.py.
from yellowpagesofmoldova.items import YellowpagesofmoldovaItem
from scrapy.item import Item


class YellowSpider(CrawlSpider):
    """Spider for yellowpages.md that crawls in-domain links recursively."""

    name = 'yellow'
    allowed_domains = ['yellowpages.md']
    start_urls = [
        'http://www.yellowpages.md/eng/companies/info/8939-arc-publishing-house']

    # Follow every in-domain link whose URL matches 'eng.+' and hand each
    # fetched page to `parse_items`. The callback is deliberately not named
    # `parse`, because CrawlSpider uses `parse` to apply these rules.
    rules = (
        Rule(SgmlLinkExtractor(allow=('eng.+')), callback='parse_items',
             follow=True),
    )

    def parse_items(self, response):
        """Build one item from the company page in ``response``.

        Each field other than ``url`` holds the list of strings returned by
        the corresponding XPath query.
        """
        sel = Selector(response)
        i = YellowpagesofmoldovaItem()
        i['url'] = response.url
        i['locality'] = sel.xpath("//tr[3]/td/p[1]/span[1]/text()").extract()
        i['title'] = sel.xpath('//title/text()').extract()
        i['title2'] = sel.xpath("//td/h1/text()").extract()
        i['website'] = sel.xpath("//p[2]/a/text()").extract()
        i['activity'] = sel.xpath("//tbody/tr[4]/td/p/text()").extract()
        i['street'] = sel.xpath("//tr/td/p[1]/span[2]/text()").extract()
        return i
CrawlSpider's `parse`
method should not be overridden, because that's where the "magic" happens. (See the warning about `parse`
here.)
Change `def parse`
to `def parse_page`
, and reference this callback in the rule: `Rule(SgmlLinkExtractor(allow=('eng.+')), callback='parse_page', follow=True),`
Comments
Post a Comment