Returns None – scraping a p tag with Python
Question:
I want to scrape the name and the number of products in stock, but it returns None.
# Visit every result page and record one dict per product container.
# `pagenum`, `driver`, `Soup`, `pd` and the Selenium helpers are expected to be
# set up before this snippet runs.
data_insight = []
for n in range(pagenum):
    pages_url = f"https://www.insight.com/en_US/search.html?qtype=all&q=tp-link&qsrc=k&pq=%7B%22pageSize%22%3A100%2C%22currentPage%22%3A{n+1}%2C%22shownFlag%22%3Afalse%2C%22priceRangeLower%22%3Anull%2C%22priceRangeUpper%22%3Anull%2C%22cmtStandards%22%3Atrue%2C%22categoryId%22%3Anull%2C%22setType%22%3Anull%2C%22setId%22%3Anull%2C%22shared%22%3Anull%2C%22groupId%22%3Anull%2C%22cmtCustomerNumber%22%3Anull%2C%22groupName%22%3Anull%2C%22fromLicense%22%3Atrue%2C%22licenseContractIds%22%3Anull%2C%22assortmentIds%22%3Anull%2C%22controller%22%3Anull%2C%22fromcs%22%3Afalse%2C%22searchTerms%22%3A%7B%22TP-LINK%2520TECHNOLOGY%22%3A%7B%22field%22%3A%22field%22%2C%22value%22%3A%22A-HYBRIS-ManufacturerId~0007045098%22%7D%7D%2C%22sortBy%22%3A%22BestMatch%22%7D"
    driver.get(pages_url)

    # Wait (timeout argument: 10) until a product container is present.
    container_locator = (By.CSS_SELECTOR, '[class="prod-section-container"]')
    WebDriverWait(driver, 10).until(EC.presence_of_element_located(container_locator))

    html = driver.page_source
    soup = Soup(html)
    for container in soup.select('[class="prod-section-container"]'):
        # The same anchor supplies both the title text and the href.
        product_link = container.find("a", class_="select-prod")
        record = {
            'title': product_link.text,
            # string= is matched exactly, so this is None unless a p tag's
            # text is exactly "Insight Part".
            'name': container.find("p", string="Insight Part"),
            'link': product_link['href'],
            # Stores whatever find() returns (the element or None), not its text.
            'price': container.find("span", class_="c-currency__value"),
            'stock': container.find("p", class_="prod-stock").text,
        }
        data_insight.append(record)

df_insight = pd.DataFrame(data_insight)
# The result is not assigned, so df_insight itself is left unchanged.
df_insight.drop_duplicates()
df_insight
Answers:
What happens?
With the string argument
you can search for strings instead of tags, but the way you use it, it looks for an exact match.
How to fix?
You can go with string=re.compile(...),
but this would only be the second-best solution.
A better one would be to adjust your CSS selector
to select the elements that carry the itemprop attributes:
soup.select('#js-search-product-items [itemprop="itemListElement"]')
Example
...
# Scrape the search results via the `itemprop` attributes of each list element.
# `driver`, `Soup`, `pd` and the Selenium helpers come from the elided setup above.
data_insight = []
for n in range(1):  # only the first result page is fetched in this example
    pages_url = f"https://www.insight.com/en_US/search.html?qtype=all&q=tp-link&qsrc=k&pq=%7B%22pageSize%22%3A100%2C%22currentPage%22%3A{n+1}%2C%22shownFlag%22%3Afalse%2C%22priceRangeLower%22%3Anull%2C%22priceRangeUpper%22%3Anull%2C%22cmtStandards%22%3Atrue%2C%22categoryId%22%3Anull%2C%22setType%22%3Anull%2C%22setId%22%3Anull%2C%22shared%22%3Anull%2C%22groupId%22%3Anull%2C%22cmtCustomerNumber%22%3Anull%2C%22groupName%22%3Anull%2C%22fromLicense%22%3Atrue%2C%22licenseContractIds%22%3Anull%2C%22assortmentIds%22%3Anull%2C%22controller%22%3Anull%2C%22fromcs%22%3Afalse%2C%22searchTerms%22%3A%7B%22TP-LINK%2520TECHNOLOGY%22%3A%7B%22field%22%3A%22field%22%2C%22value%22%3A%22A-HYBRIS-ManufacturerId~0007045098%22%7D%7D%2C%22sortBy%22%3A%22BestMatch%22%7D"
    driver.get(pages_url)
    # Wait until a product container is present before reading the page source.
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '[class="prod-section-container"]'))
    )
    html = driver.page_source
    # Name the parser explicitly so the parse does not depend on which optional
    # parser libraries happen to be installed.
    soup = Soup(html, "html.parser")
    for item in soup.select('#js-search-product-items [itemprop="itemListElement"]'):
        data_insight.append({
            'title': item.find(attrs={'itemprop': 'name'}).text,
            'sku': item.find(attrs={'itemprop': 'sku'}).text,
            # NOTE(review): this takes the element's text; if the URL is held in
            # an attribute (e.g. href) it must be read from there - confirm.
            'link': item.find(attrs={'itemprop': 'url'}).text,
            'price': item.find("span", class_="c-currency__value").text,
            # Keep only the first space-separated token of the stock text.
            'stock': item.find("p", class_="prod-stock").get_text(strip=True).split(' ')[0],
        })

df_insight = pd.DataFrame(data_insight)
# drop_duplicates() returns a new DataFrame; assign it back or nothing is removed.
df_insight = df_insight.drop_duplicates()
df_insight
Output
I want to scrape the name and the number of products in stock, but it returns None.
# Visit every result page and record one dict per product container.
# `pagenum`, `driver`, `Soup`, `pd` and the Selenium helpers are expected to be
# set up before this snippet runs.
data_insight = []
for n in range(pagenum):
    pages_url = f"https://www.insight.com/en_US/search.html?qtype=all&q=tp-link&qsrc=k&pq=%7B%22pageSize%22%3A100%2C%22currentPage%22%3A{n+1}%2C%22shownFlag%22%3Afalse%2C%22priceRangeLower%22%3Anull%2C%22priceRangeUpper%22%3Anull%2C%22cmtStandards%22%3Atrue%2C%22categoryId%22%3Anull%2C%22setType%22%3Anull%2C%22setId%22%3Anull%2C%22shared%22%3Anull%2C%22groupId%22%3Anull%2C%22cmtCustomerNumber%22%3Anull%2C%22groupName%22%3Anull%2C%22fromLicense%22%3Atrue%2C%22licenseContractIds%22%3Anull%2C%22assortmentIds%22%3Anull%2C%22controller%22%3Anull%2C%22fromcs%22%3Afalse%2C%22searchTerms%22%3A%7B%22TP-LINK%2520TECHNOLOGY%22%3A%7B%22field%22%3A%22field%22%2C%22value%22%3A%22A-HYBRIS-ManufacturerId~0007045098%22%7D%7D%2C%22sortBy%22%3A%22BestMatch%22%7D"
    driver.get(pages_url)

    # Wait (timeout argument: 10) until a product container is present.
    container_locator = (By.CSS_SELECTOR, '[class="prod-section-container"]')
    WebDriverWait(driver, 10).until(EC.presence_of_element_located(container_locator))

    html = driver.page_source
    soup = Soup(html)
    for container in soup.select('[class="prod-section-container"]'):
        # The same anchor supplies both the title text and the href.
        product_link = container.find("a", class_="select-prod")
        record = {
            'title': product_link.text,
            # string= is matched exactly, so this is None unless a p tag's
            # text is exactly "Insight Part".
            'name': container.find("p", string="Insight Part"),
            'link': product_link['href'],
            # Stores whatever find() returns (the element or None), not its text.
            'price': container.find("span", class_="c-currency__value"),
            'stock': container.find("p", class_="prod-stock").text,
        }
        data_insight.append(record)

df_insight = pd.DataFrame(data_insight)
# The result is not assigned, so df_insight itself is left unchanged.
df_insight.drop_duplicates()
df_insight
What happens?
With the string argument
you can search for strings instead of tags, but the way you use it, it looks for an exact match.
How to fix?
You can go with string=re.compile(...),
but this would only be the second-best solution.
A better one would be to adjust your CSS selector
to select the elements that carry the itemprop attributes:
soup.select('#js-search-product-items [itemprop="itemListElement"]')
Example
...
# Scrape the search results via the `itemprop` attributes of each list element.
# `driver`, `Soup`, `pd` and the Selenium helpers come from the elided setup above.
data_insight = []
for n in range(1):  # only the first result page is fetched in this example
    pages_url = f"https://www.insight.com/en_US/search.html?qtype=all&q=tp-link&qsrc=k&pq=%7B%22pageSize%22%3A100%2C%22currentPage%22%3A{n+1}%2C%22shownFlag%22%3Afalse%2C%22priceRangeLower%22%3Anull%2C%22priceRangeUpper%22%3Anull%2C%22cmtStandards%22%3Atrue%2C%22categoryId%22%3Anull%2C%22setType%22%3Anull%2C%22setId%22%3Anull%2C%22shared%22%3Anull%2C%22groupId%22%3Anull%2C%22cmtCustomerNumber%22%3Anull%2C%22groupName%22%3Anull%2C%22fromLicense%22%3Atrue%2C%22licenseContractIds%22%3Anull%2C%22assortmentIds%22%3Anull%2C%22controller%22%3Anull%2C%22fromcs%22%3Afalse%2C%22searchTerms%22%3A%7B%22TP-LINK%2520TECHNOLOGY%22%3A%7B%22field%22%3A%22field%22%2C%22value%22%3A%22A-HYBRIS-ManufacturerId~0007045098%22%7D%7D%2C%22sortBy%22%3A%22BestMatch%22%7D"
    driver.get(pages_url)
    # Wait until a product container is present before reading the page source.
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '[class="prod-section-container"]'))
    )
    html = driver.page_source
    # Name the parser explicitly so the parse does not depend on which optional
    # parser libraries happen to be installed.
    soup = Soup(html, "html.parser")
    for item in soup.select('#js-search-product-items [itemprop="itemListElement"]'):
        data_insight.append({
            'title': item.find(attrs={'itemprop': 'name'}).text,
            'sku': item.find(attrs={'itemprop': 'sku'}).text,
            # NOTE(review): this takes the element's text; if the URL is held in
            # an attribute (e.g. href) it must be read from there - confirm.
            'link': item.find(attrs={'itemprop': 'url'}).text,
            'price': item.find("span", class_="c-currency__value").text,
            # Keep only the first space-separated token of the stock text.
            'stock': item.find("p", class_="prod-stock").get_text(strip=True).split(' ')[0],
        })

df_insight = pd.DataFrame(data_insight)
# drop_duplicates() returns a new DataFrame; assign it back or nothing is removed.
df_insight = df_insight.drop_duplicates()
df_insight