python issue to join relative url to absolute url for img
up vote
0
down vote
favorite
I'm facing the following issues with my current code to make it work. I just concatenate the URL but its not working:
Current relative path (this is what I get with normal response.xpath crawl):
/imagename.jpg
This is my current code:
class MercadoSpider(CrawlSpider):
name = 'extractor'
item_count = 0
rules =
# Para cada item
Rule(LinkExtractor(allow = (), restrict_xpaths = ('//*[@id="main-container"]/div/div[2]/div[1]/ul/li[7]/a'))),
Rule(LinkExtractor(allow =(), restrict_xpaths = ('//*[@id="main-container"]/div/div[2]/div[2]/div/div/div/h4/a')),
callback = 'parse_item', follow = False)
def parse_item(self, response):
ml_item = MercadoItem()
ml_item['titulo'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[1]/div[2]/h2)').extract_first()
ml_item['sku'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[1]/div[2]/ul/li[2]/a)').extract()
ml_item['marca'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[1]/div[2]/ul/li[1]/a)').extract()
ml_item['tecnologia'] = response.xpath('normalize-space(//*[@id="DetailedSpecs"]/table/tbody/tr[4]/td)').extract_first()
ml_item['tipo'] = response.xpath('normalize-space(//*[@id="DetailedSpecs"]/table/tbody/tr[3]/td)').extract()
ml_item['precio'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[1]/div[2]/div[1]/span[2])').extract()
ml_item['color'] = response.xpath('normalize-space(//*[@id="mainC"]/div/div/div/div/ul/li/b)').extract()
ml_item['potencia'] = response.xpath('normalize-space(//*[@id="ProductReview"]/div/div/div/dl/dd/strong)').extract()
ml_item['condicion'] = response.xpath('normalize-space(//*[@class="stock in-stock"])').extract_first()
ml_item['desc_corta'] = response.xpath('normalize-space(//*[@id="tab-additional_information"])').extract()
ml_item['descripcion'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[2]/div)').extract()
ml_item['id_publicacion'] = response.xpath('normalize-space(//*[@id="mainC"]/div/div/div[11]/div[1]/ul/li[1]/b)').extract()
#imagenes del producto
xpath1 = 'http://www.website.com.ar'
xpath2 = response.xpath('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src').extract_first()
ml_item['image_urls'] = xpath1 + xpath2
ml_item['image_name'] = response.xpath('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src').extract()
#info de la tienda o vendedor
ml_item['categoria'] = response.xpath('normalize-space(//*[@class="woocommerce-breadcrumb breadcrumbs"])').extract_first()
self.item_count += 1
if self.item_count > 10000:
raise CloseSpider('item_exceeded')
yield ml_item
python scrapy
add a comment |
up vote
0
down vote
favorite
I'm facing the following issues with my current code to make it work. I just concatenate the URL but its not working:
Current relative path (this is what I get with normal response.xpath crawl):
/imagename.jpg
This is my current code:
class MercadoSpider(CrawlSpider):
name = 'extractor'
item_count = 0
rules =
# Para cada item
Rule(LinkExtractor(allow = (), restrict_xpaths = ('//*[@id="main-container"]/div/div[2]/div[1]/ul/li[7]/a'))),
Rule(LinkExtractor(allow =(), restrict_xpaths = ('//*[@id="main-container"]/div/div[2]/div[2]/div/div/div/h4/a')),
callback = 'parse_item', follow = False)
def parse_item(self, response):
ml_item = MercadoItem()
ml_item['titulo'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[1]/div[2]/h2)').extract_first()
ml_item['sku'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[1]/div[2]/ul/li[2]/a)').extract()
ml_item['marca'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[1]/div[2]/ul/li[1]/a)').extract()
ml_item['tecnologia'] = response.xpath('normalize-space(//*[@id="DetailedSpecs"]/table/tbody/tr[4]/td)').extract_first()
ml_item['tipo'] = response.xpath('normalize-space(//*[@id="DetailedSpecs"]/table/tbody/tr[3]/td)').extract()
ml_item['precio'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[1]/div[2]/div[1]/span[2])').extract()
ml_item['color'] = response.xpath('normalize-space(//*[@id="mainC"]/div/div/div/div/ul/li/b)').extract()
ml_item['potencia'] = response.xpath('normalize-space(//*[@id="ProductReview"]/div/div/div/dl/dd/strong)').extract()
ml_item['condicion'] = response.xpath('normalize-space(//*[@class="stock in-stock"])').extract_first()
ml_item['desc_corta'] = response.xpath('normalize-space(//*[@id="tab-additional_information"])').extract()
ml_item['descripcion'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[2]/div)').extract()
ml_item['id_publicacion'] = response.xpath('normalize-space(//*[@id="mainC"]/div/div/div[11]/div[1]/ul/li[1]/b)').extract()
#imagenes del producto
xpath1 = 'http://www.website.com.ar'
xpath2 = response.xpath('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src').extract_first()
ml_item['image_urls'] = xpath1 + xpath2
ml_item['image_name'] = response.xpath('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src').extract()
#info de la tienda o vendedor
ml_item['categoria'] = response.xpath('normalize-space(//*[@class="woocommerce-breadcrumb breadcrumbs"])').extract_first()
self.item_count += 1
if self.item_count > 10000:
raise CloseSpider('item_exceeded')
yield ml_item
python scrapy
It's not clear from the question what you are trying to achieve. What is the expected behaviour and what are you getting instead? Instead of posting parts of the code that are unrelevant for answering the question, please provide a minimal working example.
– Simon Fromme
Nov 9 at 21:20
Im needing to crawl images from a ecommerce website (product photo) but the html of the img code is only pointing with the relative path (on src of the img tag) so scrapy is giving me several errors because it requires the absolute image path to crawl them.
– Gabriel Alejandro
Nov 10 at 2:10
@GabrielAlejandro you should useurljoin()
function to get full image path!
– Sohan Das
Nov 10 at 3:24
add a comment |
up vote
0
down vote
favorite
up vote
0
down vote
favorite
I'm facing the following issues with my current code to make it work. I just concatenate the URL but its not working:
Current relative path (this is what I get with normal response.xpath crawl):
/imagename.jpg
This is my current code:
class MercadoSpider(CrawlSpider):
name = 'extractor'
item_count = 0
rules =
# Para cada item
Rule(LinkExtractor(allow = (), restrict_xpaths = ('//*[@id="main-container"]/div/div[2]/div[1]/ul/li[7]/a'))),
Rule(LinkExtractor(allow =(), restrict_xpaths = ('//*[@id="main-container"]/div/div[2]/div[2]/div/div/div/h4/a')),
callback = 'parse_item', follow = False)
def parse_item(self, response):
ml_item = MercadoItem()
ml_item['titulo'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[1]/div[2]/h2)').extract_first()
ml_item['sku'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[1]/div[2]/ul/li[2]/a)').extract()
ml_item['marca'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[1]/div[2]/ul/li[1]/a)').extract()
ml_item['tecnologia'] = response.xpath('normalize-space(//*[@id="DetailedSpecs"]/table/tbody/tr[4]/td)').extract_first()
ml_item['tipo'] = response.xpath('normalize-space(//*[@id="DetailedSpecs"]/table/tbody/tr[3]/td)').extract()
ml_item['precio'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[1]/div[2]/div[1]/span[2])').extract()
ml_item['color'] = response.xpath('normalize-space(//*[@id="mainC"]/div/div/div/div/ul/li/b)').extract()
ml_item['potencia'] = response.xpath('normalize-space(//*[@id="ProductReview"]/div/div/div/dl/dd/strong)').extract()
ml_item['condicion'] = response.xpath('normalize-space(//*[@class="stock in-stock"])').extract_first()
ml_item['desc_corta'] = response.xpath('normalize-space(//*[@id="tab-additional_information"])').extract()
ml_item['descripcion'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[2]/div)').extract()
ml_item['id_publicacion'] = response.xpath('normalize-space(//*[@id="mainC"]/div/div/div[11]/div[1]/ul/li[1]/b)').extract()
#imagenes del producto
xpath1 = 'http://www.website.com.ar'
xpath2 = response.xpath('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src').extract_first()
ml_item['image_urls'] = xpath1 + xpath2
ml_item['image_name'] = response.xpath('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src').extract()
#info de la tienda o vendedor
ml_item['categoria'] = response.xpath('normalize-space(//*[@class="woocommerce-breadcrumb breadcrumbs"])').extract_first()
self.item_count += 1
if self.item_count > 10000:
raise CloseSpider('item_exceeded')
yield ml_item
python scrapy
I'm facing the following issues with my current code to make it work. I just concatenate the URL but its not working:
Current relative path (this is what I get with normal response.xpath crawl):
/imagename.jpg
This is my current code:
class MercadoSpider(CrawlSpider):
name = 'extractor'
item_count = 0
rules =
# Para cada item
Rule(LinkExtractor(allow = (), restrict_xpaths = ('//*[@id="main-container"]/div/div[2]/div[1]/ul/li[7]/a'))),
Rule(LinkExtractor(allow =(), restrict_xpaths = ('//*[@id="main-container"]/div/div[2]/div[2]/div/div/div/h4/a')),
callback = 'parse_item', follow = False)
def parse_item(self, response):
ml_item = MercadoItem()
ml_item['titulo'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[1]/div[2]/h2)').extract_first()
ml_item['sku'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[1]/div[2]/ul/li[2]/a)').extract()
ml_item['marca'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[1]/div[2]/ul/li[1]/a)').extract()
ml_item['tecnologia'] = response.xpath('normalize-space(//*[@id="DetailedSpecs"]/table/tbody/tr[4]/td)').extract_first()
ml_item['tipo'] = response.xpath('normalize-space(//*[@id="DetailedSpecs"]/table/tbody/tr[3]/td)').extract()
ml_item['precio'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[1]/div[2]/div[1]/span[2])').extract()
ml_item['color'] = response.xpath('normalize-space(//*[@id="mainC"]/div/div/div/div/ul/li/b)').extract()
ml_item['potencia'] = response.xpath('normalize-space(//*[@id="ProductReview"]/div/div/div/dl/dd/strong)').extract()
ml_item['condicion'] = response.xpath('normalize-space(//*[@class="stock in-stock"])').extract_first()
ml_item['desc_corta'] = response.xpath('normalize-space(//*[@id="tab-additional_information"])').extract()
ml_item['descripcion'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[2]/div)').extract()
ml_item['id_publicacion'] = response.xpath('normalize-space(//*[@id="mainC"]/div/div/div[11]/div[1]/ul/li[1]/b)').extract()
#imagenes del producto
xpath1 = 'http://www.website.com.ar'
xpath2 = response.xpath('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src').extract_first()
ml_item['image_urls'] = xpath1 + xpath2
ml_item['image_name'] = response.xpath('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src').extract()
#info de la tienda o vendedor
ml_item['categoria'] = response.xpath('normalize-space(//*[@class="woocommerce-breadcrumb breadcrumbs"])').extract_first()
self.item_count += 1
if self.item_count > 10000:
raise CloseSpider('item_exceeded')
yield ml_item
python scrapy
python scrapy
asked Nov 9 at 21:11
Gabriel Alejandro
85
85
It's not clear from the question what you are trying to achieve. What is the expected behaviour and what are you getting instead? Instead of posting parts of the code that are unrelevant for answering the question, please provide a minimal working example.
– Simon Fromme
Nov 9 at 21:20
Im needing to crawl images from a ecommerce website (product photo) but the html of the img code is only pointing with the relative path (on src of the img tag) so scrapy is giving me several errors because it requires the absolute image path to crawl them.
– Gabriel Alejandro
Nov 10 at 2:10
@GabrielAlejandro you should useurljoin()
function to get full image path!
– Sohan Das
Nov 10 at 3:24
add a comment |
It's not clear from the question what you are trying to achieve. What is the expected behaviour and what are you getting instead? Instead of posting parts of the code that are unrelevant for answering the question, please provide a minimal working example.
– Simon Fromme
Nov 9 at 21:20
Im needing to crawl images from a ecommerce website (product photo) but the html of the img code is only pointing with the relative path (on src of the img tag) so scrapy is giving me several errors because it requires the absolute image path to crawl them.
– Gabriel Alejandro
Nov 10 at 2:10
@GabrielAlejandro you should useurljoin()
function to get full image path!
– Sohan Das
Nov 10 at 3:24
It's not clear from the question what you are trying to achieve. What is the expected behaviour and what are you getting instead? Instead of posting parts of the code that are unrelevant for answering the question, please provide a minimal working example.
– Simon Fromme
Nov 9 at 21:20
It's not clear from the question what you are trying to achieve. What is the expected behaviour and what are you getting instead? Instead of posting parts of the code that are unrelevant for answering the question, please provide a minimal working example.
– Simon Fromme
Nov 9 at 21:20
Im needing to crawl images from a ecommerce website (product photo) but the html of the img code is only pointing with the relative path (on src of the img tag) so scrapy is giving me several errors because it requires the absolute image path to crawl them.
– Gabriel Alejandro
Nov 10 at 2:10
Im needing to crawl images from a ecommerce website (product photo) but the html of the img code is only pointing with the relative path (on src of the img tag) so scrapy is giving me several errors because it requires the absolute image path to crawl them.
– Gabriel Alejandro
Nov 10 at 2:10
@GabrielAlejandro you should use
urljoin()
function to get full image path!– Sohan Das
Nov 10 at 3:24
@GabrielAlejandro you should use
urljoin()
function to get full image path!– Sohan Das
Nov 10 at 3:24
add a comment |
1 Answer
1
active
oldest
votes
up vote
1
down vote
accepted
try
absolute_url = response.urljoin(your_url_from_xpath)
scrapy documentation
I've replaced response.xpath to response.urljoin but I get the following error on console:ml_item['image_urls'] = response.urljoin('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src').extract() AttributeError: 'str' object has no attribute 'extract'
– Gabriel Alejandro
Nov 10 at 19:29
Ok, I was able to fix it, I made the following correction: image = response.xpath('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src') ml_item['image_urls'] = [urlparse.urljoin(response.url, u) for u in image.extract()]
– Gabriel Alejandro
Nov 10 at 19:48
add a comment |
1 Answer
1
active
oldest
votes
1 Answer
1
active
oldest
votes
active
oldest
votes
active
oldest
votes
up vote
1
down vote
accepted
try
absolute_url = response.urljoin(your_url_from_xpath)
scrapy documentation
I've replaced response.xpath to response.urljoin but I get the following error on console:ml_item['image_urls'] = response.urljoin('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src').extract() AttributeError: 'str' object has no attribute 'extract'
– Gabriel Alejandro
Nov 10 at 19:29
Ok, I was able to fix it, I made the following correction: image = response.xpath('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src') ml_item['image_urls'] = [urlparse.urljoin(response.url, u) for u in image.extract()]
– Gabriel Alejandro
Nov 10 at 19:48
add a comment |
up vote
1
down vote
accepted
try
absolute_url = response.urljoin(your_url_from_xpath)
scrapy documentation
I've replaced response.xpath to response.urljoin but I get the following error on console:ml_item['image_urls'] = response.urljoin('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src').extract() AttributeError: 'str' object has no attribute 'extract'
– Gabriel Alejandro
Nov 10 at 19:29
Ok, I was able to fix it, I made the following correction: image = response.xpath('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src') ml_item['image_urls'] = [urlparse.urljoin(response.url, u) for u in image.extract()]
– Gabriel Alejandro
Nov 10 at 19:48
add a comment |
up vote
1
down vote
accepted
up vote
1
down vote
accepted
try
absolute_url = response.urljoin(your_url_from_xpath)
scrapy documentation
try
absolute_url = response.urljoin(your_url_from_xpath)
scrapy documentation
answered Nov 10 at 5:12
E. Amanatov
464
464
I've replaced response.xpath to response.urljoin but I get the following error on console:ml_item['image_urls'] = response.urljoin('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src').extract() AttributeError: 'str' object has no attribute 'extract'
– Gabriel Alejandro
Nov 10 at 19:29
Ok, I was able to fix it, I made the following correction: image = response.xpath('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src') ml_item['image_urls'] = [urlparse.urljoin(response.url, u) for u in image.extract()]
– Gabriel Alejandro
Nov 10 at 19:48
add a comment |
I've replaced response.xpath to response.urljoin but I get the following error on console:ml_item['image_urls'] = response.urljoin('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src').extract() AttributeError: 'str' object has no attribute 'extract'
– Gabriel Alejandro
Nov 10 at 19:29
Ok, I was able to fix it, I made the following correction: image = response.xpath('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src') ml_item['image_urls'] = [urlparse.urljoin(response.url, u) for u in image.extract()]
– Gabriel Alejandro
Nov 10 at 19:48
I've replaced response.xpath to response.urljoin but I get the following error on console:ml_item['image_urls'] = response.urljoin('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src').extract() AttributeError: 'str' object has no attribute 'extract'
– Gabriel Alejandro
Nov 10 at 19:29
I've replaced response.xpath to response.urljoin but I get the following error on console:ml_item['image_urls'] = response.urljoin('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src').extract() AttributeError: 'str' object has no attribute 'extract'
– Gabriel Alejandro
Nov 10 at 19:29
Ok, I was able to fix it, I made the following correction: image = response.xpath('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src') ml_item['image_urls'] = [urlparse.urljoin(response.url, u) for u in image.extract()]
– Gabriel Alejandro
Nov 10 at 19:48
Ok, I was able to fix it, I made the following correction: image = response.xpath('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src') ml_item['image_urls'] = [urlparse.urljoin(response.url, u) for u in image.extract()]
– Gabriel Alejandro
Nov 10 at 19:48
add a comment |
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53233318%2fpython-issue-to-join-relative-url-to-absolute-url-for-img%23new-answer', 'question_page');
);
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
It's not clear from the question what you are trying to achieve. What is the expected behaviour and what are you getting instead? Instead of posting parts of the code that are unrelevant for answering the question, please provide a minimal working example.
– Simon Fromme
Nov 9 at 21:20
Im needing to crawl images from a ecommerce website (product photo) but the html of the img code is only pointing with the relative path (on src of the img tag) so scrapy is giving me several errors because it requires the absolute image path to crawl them.
– Gabriel Alejandro
Nov 10 at 2:10
@GabrielAlejandro you should use
urljoin()
function to get full image path!– Sohan Das
Nov 10 at 3:24