python issue to join relative url to absolute url for img









up vote
0
down vote

favorite












I'm having trouble getting my current code to work. I simply concatenate the base URL and the relative path, but it's not working:



Current relative path (this is what I get with normal response.xpath crawl):



/imagename.jpg


This is my current code:



# Crawl spider: follows the links matched by the rules below and builds one
# MercadoItem in parse_item for each response routed to that callback.
# NOTE(review): indentation and the brackets around the rules appear to have
# been lost when this snippet was extracted; the code is kept exactly as found.
class MercadoSpider(CrawlSpider):
name = 'extractor'
# Running count of responses handled by parse_item; used there to stop the crawl.
item_count = 0

rules =
# For each item
# The first Rule has no callback; the second Rule sends its matched links to
# parse_item and is declared with follow = False.
Rule(LinkExtractor(allow = (), restrict_xpaths = ('//*[@id="main-container"]/div/div[2]/div[1]/ul/li[7]/a'))),
Rule(LinkExtractor(allow =(), restrict_xpaths = ('//*[@id="main-container"]/div/div[2]/div[2]/div/div/div/h4/a')),
callback = 'parse_item', follow = False)


# Fill a MercadoItem from XPath queries on the response and yield it.
# Raises CloseSpider('item_exceeded') once more than 10000 responses have been
# handled; the raise happens before the yield, so that last item is not emitted.
def parse_item(self, response):
ml_item = MercadoItem()
# Some fields are stored with .extract_first() and others with .extract().
ml_item['titulo'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[1]/div[2]/h2)').extract_first()
ml_item['sku'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[1]/div[2]/ul/li[2]/a)').extract()
ml_item['marca'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[1]/div[2]/ul/li[1]/a)').extract()
ml_item['tecnologia'] = response.xpath('normalize-space(//*[@id="DetailedSpecs"]/table/tbody/tr[4]/td)').extract_first()
ml_item['tipo'] = response.xpath('normalize-space(//*[@id="DetailedSpecs"]/table/tbody/tr[3]/td)').extract()
ml_item['precio'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[1]/div[2]/div[1]/span[2])').extract()
ml_item['color'] = response.xpath('normalize-space(//*[@id="mainC"]/div/div/div/div/ul/li/b)').extract()
ml_item['potencia'] = response.xpath('normalize-space(//*[@id="ProductReview"]/div/div/div/dl/dd/strong)').extract()
ml_item['condicion'] = response.xpath('normalize-space(//*[@class="stock in-stock"])').extract_first()
ml_item['desc_corta'] = response.xpath('normalize-space(//*[@id="tab-additional_information"])').extract()
ml_item['descripcion'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[2]/div)').extract()
ml_item['id_publicacion'] = response.xpath('normalize-space(//*[@id="mainC"]/div/div/div[11]/div[1]/ul/li[1]/b)').extract()
# Product images
# The hard-coded site root is prepended to the first img src found on the page.
# NOTE(review): extract_first() presumably returns None when nothing matches,
# which would make the concatenation below fail; confirm.
# NOTE(review): image_urls receives a single string here; Scrapy's images
# pipeline presumably expects a list of absolute URLs; confirm.
xpath1 = 'http://www.website.com.ar'
xpath2 = response.xpath('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src').extract_first()
ml_item['image_urls'] = xpath1 + xpath2
ml_item['image_name'] = response.xpath('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src').extract()
# Store or seller info
ml_item['categoria'] = response.xpath('normalize-space(//*[@class="woocommerce-breadcrumb breadcrumbs"])').extract_first()
# Count this response and stop the crawl once the count exceeds 10000.
self.item_count += 1
if self.item_count > 10000:
raise CloseSpider('item_exceeded')
yield ml_item









share|improve this question





















  • It's not clear from the question what you are trying to achieve. What is the expected behaviour, and what are you getting instead? Instead of posting parts of the code that are irrelevant to answering the question, please provide a minimal working example.
    – Simon Fromme
    Nov 9 at 21:20











  • I need to crawl images (product photos) from an e-commerce website, but the src attribute of each img tag contains only a relative path, so Scrapy gives me several errors because it requires the absolute image URL to download them.
    – Gabriel Alejandro
    Nov 10 at 2:10










  • @GabrielAlejandro you should use urljoin() function to get full image path!
    – Sohan Das
    Nov 10 at 3:24















up vote
0
down vote

favorite












I'm having trouble getting my current code to work. I simply concatenate the base URL and the relative path, but it's not working:



Current relative path (this is what I get with normal response.xpath crawl):



/imagename.jpg


This is my current code:



# Crawl spider: follows the links matched by the rules below and builds one
# MercadoItem in parse_item for each response routed to that callback.
# NOTE(review): indentation and the brackets around the rules appear to have
# been lost when this snippet was extracted; the code is kept exactly as found.
class MercadoSpider(CrawlSpider):
name = 'extractor'
# Running count of responses handled by parse_item; used there to stop the crawl.
item_count = 0

rules =
# For each item
# The first Rule has no callback; the second Rule sends its matched links to
# parse_item and is declared with follow = False.
Rule(LinkExtractor(allow = (), restrict_xpaths = ('//*[@id="main-container"]/div/div[2]/div[1]/ul/li[7]/a'))),
Rule(LinkExtractor(allow =(), restrict_xpaths = ('//*[@id="main-container"]/div/div[2]/div[2]/div/div/div/h4/a')),
callback = 'parse_item', follow = False)


# Fill a MercadoItem from XPath queries on the response and yield it.
# Raises CloseSpider('item_exceeded') once more than 10000 responses have been
# handled; the raise happens before the yield, so that last item is not emitted.
def parse_item(self, response):
ml_item = MercadoItem()
# Some fields are stored with .extract_first() and others with .extract().
ml_item['titulo'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[1]/div[2]/h2)').extract_first()
ml_item['sku'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[1]/div[2]/ul/li[2]/a)').extract()
ml_item['marca'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[1]/div[2]/ul/li[1]/a)').extract()
ml_item['tecnologia'] = response.xpath('normalize-space(//*[@id="DetailedSpecs"]/table/tbody/tr[4]/td)').extract_first()
ml_item['tipo'] = response.xpath('normalize-space(//*[@id="DetailedSpecs"]/table/tbody/tr[3]/td)').extract()
ml_item['precio'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[1]/div[2]/div[1]/span[2])').extract()
ml_item['color'] = response.xpath('normalize-space(//*[@id="mainC"]/div/div/div/div/ul/li/b)').extract()
ml_item['potencia'] = response.xpath('normalize-space(//*[@id="ProductReview"]/div/div/div/dl/dd/strong)').extract()
ml_item['condicion'] = response.xpath('normalize-space(//*[@class="stock in-stock"])').extract_first()
ml_item['desc_corta'] = response.xpath('normalize-space(//*[@id="tab-additional_information"])').extract()
ml_item['descripcion'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[2]/div)').extract()
ml_item['id_publicacion'] = response.xpath('normalize-space(//*[@id="mainC"]/div/div/div[11]/div[1]/ul/li[1]/b)').extract()
# Product images
# The hard-coded site root is prepended to the first img src found on the page.
# NOTE(review): extract_first() presumably returns None when nothing matches,
# which would make the concatenation below fail; confirm.
# NOTE(review): image_urls receives a single string here; Scrapy's images
# pipeline presumably expects a list of absolute URLs; confirm.
xpath1 = 'http://www.website.com.ar'
xpath2 = response.xpath('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src').extract_first()
ml_item['image_urls'] = xpath1 + xpath2
ml_item['image_name'] = response.xpath('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src').extract()
# Store or seller info
ml_item['categoria'] = response.xpath('normalize-space(//*[@class="woocommerce-breadcrumb breadcrumbs"])').extract_first()
# Count this response and stop the crawl once the count exceeds 10000.
self.item_count += 1
if self.item_count > 10000:
raise CloseSpider('item_exceeded')
yield ml_item









share|improve this question





















  • It's not clear from the question what you are trying to achieve. What is the expected behaviour, and what are you getting instead? Instead of posting parts of the code that are irrelevant to answering the question, please provide a minimal working example.
    – Simon Fromme
    Nov 9 at 21:20











  • I need to crawl images (product photos) from an e-commerce website, but the src attribute of each img tag contains only a relative path, so Scrapy gives me several errors because it requires the absolute image URL to download them.
    – Gabriel Alejandro
    Nov 10 at 2:10










  • @GabrielAlejandro you should use urljoin() function to get full image path!
    – Sohan Das
    Nov 10 at 3:24













up vote
0
down vote

favorite









up vote
0
down vote

favorite











I'm having trouble getting my current code to work. I simply concatenate the base URL and the relative path, but it's not working:



Current relative path (this is what I get with normal response.xpath crawl):



/imagename.jpg


This is my current code:



# Crawl spider: follows the links matched by the rules below and builds one
# MercadoItem in parse_item for each response routed to that callback.
# NOTE(review): indentation and the brackets around the rules appear to have
# been lost when this snippet was extracted; the code is kept exactly as found.
class MercadoSpider(CrawlSpider):
name = 'extractor'
# Running count of responses handled by parse_item; used there to stop the crawl.
item_count = 0

rules =
# For each item
# The first Rule has no callback; the second Rule sends its matched links to
# parse_item and is declared with follow = False.
Rule(LinkExtractor(allow = (), restrict_xpaths = ('//*[@id="main-container"]/div/div[2]/div[1]/ul/li[7]/a'))),
Rule(LinkExtractor(allow =(), restrict_xpaths = ('//*[@id="main-container"]/div/div[2]/div[2]/div/div/div/h4/a')),
callback = 'parse_item', follow = False)


# Fill a MercadoItem from XPath queries on the response and yield it.
# Raises CloseSpider('item_exceeded') once more than 10000 responses have been
# handled; the raise happens before the yield, so that last item is not emitted.
def parse_item(self, response):
ml_item = MercadoItem()
# Some fields are stored with .extract_first() and others with .extract().
ml_item['titulo'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[1]/div[2]/h2)').extract_first()
ml_item['sku'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[1]/div[2]/ul/li[2]/a)').extract()
ml_item['marca'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[1]/div[2]/ul/li[1]/a)').extract()
ml_item['tecnologia'] = response.xpath('normalize-space(//*[@id="DetailedSpecs"]/table/tbody/tr[4]/td)').extract_first()
ml_item['tipo'] = response.xpath('normalize-space(//*[@id="DetailedSpecs"]/table/tbody/tr[3]/td)').extract()
ml_item['precio'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[1]/div[2]/div[1]/span[2])').extract()
ml_item['color'] = response.xpath('normalize-space(//*[@id="mainC"]/div/div/div/div/ul/li/b)').extract()
ml_item['potencia'] = response.xpath('normalize-space(//*[@id="ProductReview"]/div/div/div/dl/dd/strong)').extract()
ml_item['condicion'] = response.xpath('normalize-space(//*[@class="stock in-stock"])').extract_first()
ml_item['desc_corta'] = response.xpath('normalize-space(//*[@id="tab-additional_information"])').extract()
ml_item['descripcion'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[2]/div)').extract()
ml_item['id_publicacion'] = response.xpath('normalize-space(//*[@id="mainC"]/div/div/div[11]/div[1]/ul/li[1]/b)').extract()
# Product images
# The hard-coded site root is prepended to the first img src found on the page.
# NOTE(review): extract_first() presumably returns None when nothing matches,
# which would make the concatenation below fail; confirm.
# NOTE(review): image_urls receives a single string here; Scrapy's images
# pipeline presumably expects a list of absolute URLs; confirm.
xpath1 = 'http://www.website.com.ar'
xpath2 = response.xpath('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src').extract_first()
ml_item['image_urls'] = xpath1 + xpath2
ml_item['image_name'] = response.xpath('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src').extract()
# Store or seller info
ml_item['categoria'] = response.xpath('normalize-space(//*[@class="woocommerce-breadcrumb breadcrumbs"])').extract_first()
# Count this response and stop the crawl once the count exceeds 10000.
self.item_count += 1
if self.item_count > 10000:
raise CloseSpider('item_exceeded')
yield ml_item









share|improve this question













I'm having trouble getting my current code to work. I simply concatenate the base URL and the relative path, but it's not working:



Current relative path (this is what I get with normal response.xpath crawl):



/imagename.jpg


This is my current code:



# Crawl spider: follows the links matched by the rules below and builds one
# MercadoItem in parse_item for each response routed to that callback.
# NOTE(review): indentation and the brackets around the rules appear to have
# been lost when this snippet was extracted; the code is kept exactly as found.
class MercadoSpider(CrawlSpider):
name = 'extractor'
# Running count of responses handled by parse_item; used there to stop the crawl.
item_count = 0

rules =
# For each item
# The first Rule has no callback; the second Rule sends its matched links to
# parse_item and is declared with follow = False.
Rule(LinkExtractor(allow = (), restrict_xpaths = ('//*[@id="main-container"]/div/div[2]/div[1]/ul/li[7]/a'))),
Rule(LinkExtractor(allow =(), restrict_xpaths = ('//*[@id="main-container"]/div/div[2]/div[2]/div/div/div/h4/a')),
callback = 'parse_item', follow = False)


# Fill a MercadoItem from XPath queries on the response and yield it.
# Raises CloseSpider('item_exceeded') once more than 10000 responses have been
# handled; the raise happens before the yield, so that last item is not emitted.
def parse_item(self, response):
ml_item = MercadoItem()
# Some fields are stored with .extract_first() and others with .extract().
ml_item['titulo'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[1]/div[2]/h2)').extract_first()
ml_item['sku'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[1]/div[2]/ul/li[2]/a)').extract()
ml_item['marca'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[1]/div[2]/ul/li[1]/a)').extract()
ml_item['tecnologia'] = response.xpath('normalize-space(//*[@id="DetailedSpecs"]/table/tbody/tr[4]/td)').extract_first()
ml_item['tipo'] = response.xpath('normalize-space(//*[@id="DetailedSpecs"]/table/tbody/tr[3]/td)').extract()
ml_item['precio'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[1]/div[2]/div[1]/span[2])').extract()
ml_item['color'] = response.xpath('normalize-space(//*[@id="mainC"]/div/div/div/div/ul/li/b)').extract()
ml_item['potencia'] = response.xpath('normalize-space(//*[@id="ProductReview"]/div/div/div/dl/dd/strong)').extract()
ml_item['condicion'] = response.xpath('normalize-space(//*[@class="stock in-stock"])').extract_first()
ml_item['desc_corta'] = response.xpath('normalize-space(//*[@id="tab-additional_information"])').extract()
ml_item['descripcion'] = response.xpath('normalize-space(//*[@id="main-container"]/div/div[2]/div[2]/div)').extract()
ml_item['id_publicacion'] = response.xpath('normalize-space(//*[@id="mainC"]/div/div/div[11]/div[1]/ul/li[1]/b)').extract()
# Product images
# The hard-coded site root is prepended to the first img src found on the page.
# NOTE(review): extract_first() presumably returns None when nothing matches,
# which would make the concatenation below fail; confirm.
# NOTE(review): image_urls receives a single string here; Scrapy's images
# pipeline presumably expects a list of absolute URLs; confirm.
xpath1 = 'http://www.website.com.ar'
xpath2 = response.xpath('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src').extract_first()
ml_item['image_urls'] = xpath1 + xpath2
ml_item['image_name'] = response.xpath('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src').extract()
# Store or seller info
ml_item['categoria'] = response.xpath('normalize-space(//*[@class="woocommerce-breadcrumb breadcrumbs"])').extract_first()
# Count this response and stop the crawl once the count exceeds 10000.
self.item_count += 1
if self.item_count > 10000:
raise CloseSpider('item_exceeded')
yield ml_item






python scrapy






share|improve this question













share|improve this question











share|improve this question




share|improve this question










asked Nov 9 at 21:11









Gabriel Alejandro

85




85











  • It's not clear from the question what you are trying to achieve. What is the expected behaviour, and what are you getting instead? Instead of posting parts of the code that are irrelevant to answering the question, please provide a minimal working example.
    – Simon Fromme
    Nov 9 at 21:20











  • I need to crawl images (product photos) from an e-commerce website, but the src attribute of each img tag contains only a relative path, so Scrapy gives me several errors because it requires the absolute image URL to download them.
    – Gabriel Alejandro
    Nov 10 at 2:10










  • @GabrielAlejandro you should use urljoin() function to get full image path!
    – Sohan Das
    Nov 10 at 3:24

















  • It's not clear from the question what you are trying to achieve. What is the expected behaviour, and what are you getting instead? Instead of posting parts of the code that are irrelevant to answering the question, please provide a minimal working example.
    – Simon Fromme
    Nov 9 at 21:20











  • I need to crawl images (product photos) from an e-commerce website, but the src attribute of each img tag contains only a relative path, so Scrapy gives me several errors because it requires the absolute image URL to download them.
    – Gabriel Alejandro
    Nov 10 at 2:10










  • @GabrielAlejandro you should use urljoin() function to get full image path!
    – Sohan Das
    Nov 10 at 3:24
















It's not clear from the question what you are trying to achieve. What is the expected behaviour, and what are you getting instead? Instead of posting parts of the code that are irrelevant to answering the question, please provide a minimal working example.
– Simon Fromme
Nov 9 at 21:20





It's not clear from the question what you are trying to achieve. What is the expected behaviour, and what are you getting instead? Instead of posting parts of the code that are irrelevant to answering the question, please provide a minimal working example.
– Simon Fromme
Nov 9 at 21:20













I need to crawl images (product photos) from an e-commerce website, but the src attribute of each img tag contains only a relative path, so Scrapy gives me several errors because it requires the absolute image URL to download them.
– Gabriel Alejandro
Nov 10 at 2:10




I need to crawl images (product photos) from an e-commerce website, but the src attribute of each img tag contains only a relative path, so Scrapy gives me several errors because it requires the absolute image URL to download them.
– Gabriel Alejandro
Nov 10 at 2:10












@GabrielAlejandro you should use urljoin() function to get full image path!
– Sohan Das
Nov 10 at 3:24





@GabrielAlejandro you should use urljoin() function to get full image path!
– Sohan Das
Nov 10 at 3:24













1 Answer
1






active

oldest

votes

















up vote
1
down vote



accepted










try




absolute_url = response.urljoin(your_url_from_xpath)




scrapy documentation






share|improve this answer




















  • I've replaced response.xpath to response.urljoin but I get the following error on console:ml_item['image_urls'] = response.urljoin('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src').extract() AttributeError: 'str' object has no attribute 'extract'
    – Gabriel Alejandro
    Nov 10 at 19:29











  • Ok, I was able to fix it, I made the following correction: image = response.xpath('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src') ml_item['image_urls'] = [urlparse.urljoin(response.url, u) for u in image.extract()]
    – Gabriel Alejandro
    Nov 10 at 19:48










Your Answer






StackExchange.ifUsing("editor", function ()
StackExchange.using("externalEditor", function ()
StackExchange.using("snippets", function ()
StackExchange.snippets.init();
);
);
, "code-snippets");

StackExchange.ready(function()
var channelOptions =
tags: "".split(" "),
id: "1"
;
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function()
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled)
StackExchange.using("snippets", function()
createEditor();
);

else
createEditor();

);

function createEditor()
StackExchange.prepareEditor(
heartbeatType: 'answer',
convertImagesToLinks: true,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: 10,
bindNavPrevention: true,
postfix: "",
imageUploader:
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
,
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
);



);













 

draft saved


draft discarded


















StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53233318%2fpython-issue-to-join-relative-url-to-absolute-url-for-img%23new-answer', 'question_page');

);

Post as a guest















Required, but never shown

























1 Answer
1






active

oldest

votes








1 Answer
1






active

oldest

votes









active

oldest

votes






active

oldest

votes








up vote
1
down vote



accepted










try




absolute_url = response.urljoin(your_url_from_xpath)




scrapy documentation






share|improve this answer




















  • I've replaced response.xpath to response.urljoin but I get the following error on console:ml_item['image_urls'] = response.urljoin('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src').extract() AttributeError: 'str' object has no attribute 'extract'
    – Gabriel Alejandro
    Nov 10 at 19:29











  • Ok, I was able to fix it, I made the following correction: image = response.xpath('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src') ml_item['image_urls'] = [urlparse.urljoin(response.url, u) for u in image.extract()]
    – Gabriel Alejandro
    Nov 10 at 19:48














up vote
1
down vote



accepted










try




absolute_url = response.urljoin(your_url_from_xpath)




scrapy documentation






share|improve this answer




















  • I've replaced response.xpath to response.urljoin but I get the following error on console:ml_item['image_urls'] = response.urljoin('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src').extract() AttributeError: 'str' object has no attribute 'extract'
    – Gabriel Alejandro
    Nov 10 at 19:29











  • Ok, I was able to fix it, I made the following correction: image = response.xpath('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src') ml_item['image_urls'] = [urlparse.urljoin(response.url, u) for u in image.extract()]
    – Gabriel Alejandro
    Nov 10 at 19:48












up vote
1
down vote



accepted







up vote
1
down vote



accepted






try




absolute_url = response.urljoin(your_url_from_xpath)




scrapy documentation






share|improve this answer












try




absolute_url = response.urljoin(your_url_from_xpath)




scrapy documentation







share|improve this answer












share|improve this answer



share|improve this answer










answered Nov 10 at 5:12









E. Amanatov

464




464











  • I've replaced response.xpath to response.urljoin but I get the following error on console:ml_item['image_urls'] = response.urljoin('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src').extract() AttributeError: 'str' object has no attribute 'extract'
    – Gabriel Alejandro
    Nov 10 at 19:29











  • Ok, I was able to fix it, I made the following correction: image = response.xpath('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src') ml_item['image_urls'] = [urlparse.urljoin(response.url, u) for u in image.extract()]
    – Gabriel Alejandro
    Nov 10 at 19:48
















  • I've replaced response.xpath to response.urljoin but I get the following error on console:ml_item['image_urls'] = response.urljoin('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src').extract() AttributeError: 'str' object has no attribute 'extract'
    – Gabriel Alejandro
    Nov 10 at 19:29











  • Ok, I was able to fix it, I made the following correction: image = response.xpath('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src') ml_item['image_urls'] = [urlparse.urljoin(response.url, u) for u in image.extract()]
    – Gabriel Alejandro
    Nov 10 at 19:48















I've replaced response.xpath to response.urljoin but I get the following error on console:ml_item['image_urls'] = response.urljoin('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src').extract() AttributeError: 'str' object has no attribute 'extract'
– Gabriel Alejandro
Nov 10 at 19:29





I've replaced response.xpath to response.urljoin but I get the following error on console:ml_item['image_urls'] = response.urljoin('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src').extract() AttributeError: 'str' object has no attribute 'extract'
– Gabriel Alejandro
Nov 10 at 19:29













Ok, I was able to fix it, I made the following correction: image = response.xpath('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src') ml_item['image_urls'] = [urlparse.urljoin(response.url, u) for u in image.extract()]
– Gabriel Alejandro
Nov 10 at 19:48




Ok, I was able to fix it, I made the following correction: image = response.xpath('//*[@id="main-container"]/div/div[2]/div[1]/div[1]/p/img/@src') ml_item['image_urls'] = [urlparse.urljoin(response.url, u) for u in image.extract()]
– Gabriel Alejandro
Nov 10 at 19:48

















 

draft saved


draft discarded















































 


draft saved


draft discarded














StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53233318%2fpython-issue-to-join-relative-url-to-absolute-url-for-img%23new-answer', 'question_page');

);

Post as a guest















Required, but never shown





















































Required, but never shown














Required, but never shown












Required, but never shown







Required, but never shown

































Required, but never shown














Required, but never shown












Required, but never shown







Required, but never shown







Popular posts from this blog

How to how show current date and time by default on contact form 7 in WordPress without taking input from user in datetimepicker

Syphilis

Darth Vader #20