Python 3 BeautifulSoup get URL (href or baseURL) if div Class caption text contains "English"
Question:
<div class="gallery" data-tags="19 16 40193 41706 40476 7921 815 425 900 362 229 154 146 13 65 129 766 25 9 51931 188">
<a href="/g/987654/" class="cover" style="padding:0 0 142.79999999999998% 0">
<img is="lazyload-image" class="" width="250" height="357" data-src="https://abc.cloud.xyz/galleries/123456/thumb.jpg" alt="" src="https://abc.cloud.xyz/galleries/123456/thumb.jpg">
<div class="caption">[User] Text ABCDEFGH [English] </div>
</a>
</div>
The Programm don’t save the URLs/hrefs in to the txt file. I think that it can’t find the href
If div element with class caption contains the Word English then the href (/g/987654/) of the beloging a element class cover should be saved in a txt file.
from bs4 import BeautifulSoup
import requests
url = "https://google.com"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
base_urls = []
for div in soup.find_all("div", {"class": "caption"}):
if "English" in div.text:
a_tag = div.find_previous_sibling("a")
if a_tag:
base_urls.append(a_tag["baseURL"])
with open("base_urls.txt", "w") as f:
for base_url in base_urls:
f.write(base_url + "n")
**What i tried so far
**
This Code works BUT it saves all the hrefs in to the txt file…
from bs4 import BeautifulSoup
import requests
url = "https://google.com"
page = requests.get(url)
soup = BeautifulSoup(page.content, "html.parser")
links = soup.find_all("a")
hrefs = [link["href"] for link in links]
with open("links_test1.txt", "w") as file:
for href in hrefs:
file.write(href + "n")
#######################################################################
NEW PART
from bs4 import BeautifulSoup
import requests
lurl = ["https://web.com/page1","https://web.com/page2","https://web.com/page3"]
for url in lurl:
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
base_urls = []
for div in soup.find_all("div", {"class": "caption"}):
if "English" in div.text:
a_tag = div.find_previous("a")
if a_tag:
base_urls.append(a_tag["href"])
with open("base_urls2.txt", "w") as f:
for base_url in base_urls:
f.write(base_url + "n")
It would be ideal if I could input a list or a txt file containing all the URLs. Any Idea??? For me trying to import a txt file in beautifulsoup was not possible…
I’m pretty new to python…
The txt file coul/would look like this
https://web1.com
https.//user1.com
https://web.com
Each line only 1 url
Answers:
Looking at the HTML snippet you should use .find_previous
instead of .find_previous_sibling
. Also, use a_tag['href']
, not a_tag['baseURL']
:
from bs4 import BeautifulSoup
html_doc = """
<div class="gallery" data-tags="19 16 40193 41706 40476 7921 815 425 900 362 229 154 146 13 65 129 766 25 9 51931 188">
<a href="/g/987654/" class="cover" style="padding:0 0 142.79999999999998% 0">
<img is="lazyload-image" class="" width="250" height="357" data-src="https://abc.cloud.xyz/galleries/123456/thumb.jpg" alt="" src="https://abc.cloud.xyz/galleries/123456/thumb.jpg">
<div class="caption">[User] Text ABCDEFGH [English] </div>
</a>
</div>"""
soup = BeautifulSoup(html_doc, "html.parser")
base_urls = []
for div in soup.find_all("div", {"class": "caption"}):
if "English" in div.text:
a_tag = div.find_previous("a")
if a_tag:
base_urls.append(a_tag["href"])
print(base_urls)
Prints:
['/g/987654/']
<div class="gallery" data-tags="19 16 40193 41706 40476 7921 815 425 900 362 229 154 146 13 65 129 766 25 9 51931 188">
<a href="/g/987654/" class="cover" style="padding:0 0 142.79999999999998% 0">
<img is="lazyload-image" class="" width="250" height="357" data-src="https://abc.cloud.xyz/galleries/123456/thumb.jpg" alt="" src="https://abc.cloud.xyz/galleries/123456/thumb.jpg">
<div class="caption">[User] Text ABCDEFGH [English] </div>
</a>
</div>
The Programm don’t save the URLs/hrefs in to the txt file. I think that it can’t find the href
If div element with class caption contains the Word English then the href (/g/987654/) of the beloging a element class cover should be saved in a txt file.
from bs4 import BeautifulSoup
import requests
url = "https://google.com"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
base_urls = []
for div in soup.find_all("div", {"class": "caption"}):
if "English" in div.text:
a_tag = div.find_previous_sibling("a")
if a_tag:
base_urls.append(a_tag["baseURL"])
with open("base_urls.txt", "w") as f:
for base_url in base_urls:
f.write(base_url + "n")
**What i tried so far
**
This Code works BUT it saves all the hrefs in to the txt file…
from bs4 import BeautifulSoup
import requests
url = "https://google.com"
page = requests.get(url)
soup = BeautifulSoup(page.content, "html.parser")
links = soup.find_all("a")
hrefs = [link["href"] for link in links]
with open("links_test1.txt", "w") as file:
for href in hrefs:
file.write(href + "n")
#######################################################################
NEW PART
from bs4 import BeautifulSoup
import requests
lurl = ["https://web.com/page1","https://web.com/page2","https://web.com/page3"]
for url in lurl:
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
base_urls = []
for div in soup.find_all("div", {"class": "caption"}):
if "English" in div.text:
a_tag = div.find_previous("a")
if a_tag:
base_urls.append(a_tag["href"])
with open("base_urls2.txt", "w") as f:
for base_url in base_urls:
f.write(base_url + "n")
It would be ideal if I could input a list or a txt file containing all the URLs. Any Idea??? For me trying to import a txt file in beautifulsoup was not possible…
I’m pretty new to python…
The txt file coul/would look like this
https://web1.com
https.//user1.com
https://web.com
Each line only 1 url
Looking at the HTML snippet you should use .find_previous
instead of .find_previous_sibling
. Also, use a_tag['href']
, not a_tag['baseURL']
:
from bs4 import BeautifulSoup
html_doc = """
<div class="gallery" data-tags="19 16 40193 41706 40476 7921 815 425 900 362 229 154 146 13 65 129 766 25 9 51931 188">
<a href="/g/987654/" class="cover" style="padding:0 0 142.79999999999998% 0">
<img is="lazyload-image" class="" width="250" height="357" data-src="https://abc.cloud.xyz/galleries/123456/thumb.jpg" alt="" src="https://abc.cloud.xyz/galleries/123456/thumb.jpg">
<div class="caption">[User] Text ABCDEFGH [English] </div>
</a>
</div>"""
soup = BeautifulSoup(html_doc, "html.parser")
base_urls = []
for div in soup.find_all("div", {"class": "caption"}):
if "English" in div.text:
a_tag = div.find_previous("a")
if a_tag:
base_urls.append(a_tag["href"])
print(base_urls)
Prints:
['/g/987654/']