Add Hyperlinks to HTML using BeautifulSoup in Python using Anchor Text and URL stored in a CSV File

Question:

I want to write a Python program that uses Beautiful Soup to hyperlink words in an HTML document, using a CSV file that contains the anchor text (anchor_text) and the URL (hyperlink) for each link.

The CSV file has 2 columns:

anchor_text hyperlink
Google https://www.google.com
Bing https://bing.com
Yahoo https://yahoo.com
Active Campaign https://activecampaign.com

Here is sample HTML

<!-- wp:paragraph -->
<p>This is a existing link <a class="test" href="https://yahoo.com/">Yahoo</a> Text</p>
<!-- /wp:paragraph -->

<!-- wp:paragraph -->
<p>This is another Google Text</p>
<!-- /wp:paragraph -->

<!-- wp:paragraph -->
<p>This is another lowercase bing Text</p>
<!-- /wp:paragraph -->

<!-- wp:paragraph -->
<p>This is another multi word Active Campaign Text</p>
<!-- /wp:paragraph -->

I want the output to be

<!-- wp:paragraph -->
<p>This is a existing link <a href="https://yahoo.com/">Yahoo</a> Text</p>
<!-- /wp:paragraph -->

<!-- wp:paragraph -->
<p>This is another <a href="https://www.google.com/">Google</a> Text</p>
<!-- /wp:paragraph -->

<!-- wp:paragraph -->
<p>This is another lowercase <a href="https://bing.com/">bing</a> Text</p>
<!-- /wp:paragraph -->

<!-- wp:paragraph -->
<p>This is another multi word <a href="https://activecampaign.com/">Active Campaign</a> Text</p>
<!-- /wp:paragraph -->

Any help is appreciated

Asked By: Abhishek R

||

Answers:

You should try with anchor/links on the outer loop and then break down the matching strings in the inner loop:

import os
import pandas as pd
import re
from bs4 import BeautifulSoup
from bs4 import element as bs4_element
import csv

# Sample WordPress-style markup used as input: four paragraphs, each wrapped in
# "wp:paragraph" HTML comments. The first paragraph already contains an <a>
# element; the last one ends with the phrase followed directly by a full stop.
html_doc = """
<!-- wp:paragraph -->
<p>This is a existing link <a class="test" href="https://yahoo.com/">Yahoo</a> Text</p>
<!-- /wp:paragraph -->
 
<!-- wp:paragraph -->
<p>This is another Google Text</p>
<!-- /wp:paragraph -->
 
<!-- wp:paragraph -->
<p>This is another lowercase bing Text</p>
<!-- /wp:paragraph -->
 
<!-- wp:paragraph -->
<p>This is another multi word Active Campaign.</p>
<!-- /wp:paragraph -->
"""

# Parse the markup with the standard-library-backed 'html.parser' builder; the
# resulting tree is what the rest of the script inspects and modifies.
soup = BeautifulSoup(html_doc, 'html.parser')

# Read the anchor-text -> URL pairs from the CSV file.
# newline='' is what the csv module asks for, so that quoted fields containing
# line breaks are parsed correctly.
with open('file.csv', 'r', newline='') as csv_file:
    reader = csv.reader(csv_file)
    # dict(reader) raises ValueError as soon as one row does not have exactly
    # two fields (a blank line is read as []), so build the mapping row by row
    # and ignore rows that do not hold at least an (anchor_text, hyperlink) pair.
    hyperlinks = {row[0]: row[1] for row in reader if len(row) >= 2}

# Exact node type used further down to pick out plain text nodes.
be_navStr = bs4_element.NavigableString

# Cleaned (anchor_text, hyperlink) pairs, in CSV order.
hList = []
for anchor_text, hyperlink in hyperlinks.items():
    anchor_text, hyperlink = anchor_text.strip(), hyperlink.strip()
    if not anchor_text or not hyperlink:
        continue  # no blanks
    # NOTE(review): assumes the file may start with the
    # "anchor_text,hyperlink" header row shown in the question; skip it so the
    # column titles are not treated as a phrase to link.
    if (anchor_text.lower(), hyperlink.lower()) == ('anchor_text', 'hyperlink'):
        continue
    hList.append((anchor_text, hyperlink))
 

# Show the document before any links are added. The trailing '\n' arguments
# only add blank lines between the banner, the markup and the closing rule.
print('#'*35, 'OLD', '#'*35, '\n')
print(soup, '\n')
print('#'*75, '\n\n\n')

# Wrap every whole-word, case-insensitive occurrence of each anchor text in an
# <a href=...> element, leaving text that is already inside a hyperlink alone.
for txt, link in hList:
    # Lookarounds rather than \b, so anchor texts that begin or end with a
    # non-word character still match, and so "bing" is not found inside
    # "Plumbing". re.escape() keeps regex metacharacters in the CSV text literal.
    pattern = re.compile(r'(?<!\w)' + re.escape(txt) + r'(?!\w)', re.IGNORECASE)

    # Snapshot the matching text nodes first, because the tree is modified
    # below. The exact type check is deliberate: bs4's Comment is a subclass of
    # NavigableString, and HTML comments must not be rewritten.
    navStrs = [
        d for d in soup.descendants
        if type(d) is be_navStr and pattern.search(d)
    ]

    for ns in navStrs:
        text = str(ns)

        # Never create a hyperlink nested inside an existing one, at any depth.
        if ns.find_parent('a') is not None:
            # If the existing link's text is exactly the anchor text, point it
            # at the URL from the CSV.
            if ns.parent.name == 'a' and text.lower() == txt.lower():
                ns.parent['href'] = link  # comment out if you don't want to replace/update the link
            continue

        # Split the text node into plain-text pieces and new <a> tags,
        # preserving the original casing of every matched occurrence.
        pieces = []
        pos = 0
        for match in pattern.finditer(text):
            if match.start() > pos:
                pieces.append(text[pos:match.start()])
            hlTag = soup.new_tag('a', href=link)
            hlTag.append(match.group(0))
            pieces.append(hlTag)
            pos = match.end()

        if not pieces:
            continue  # nothing to link; leave the node exactly as it was

        for piece in pieces:
            ns.insert_before(piece)

        # Whatever follows the last match stays in place of the original node.
        remainder = text[pos:]
        if remainder:
            ns.replace_with(remainder)
        else:
            ns.extract()

# Show the document after the links have been added. The trailing '\n'
# arguments only add blank lines after the banner and the markup.
print('#'*35, 'NEW', '#'*35, '\n')
print(soup, '\n')
print('#'*75)

printed output:

################################### OLD ################################### 

<!-- wp:paragraph -->
<p>This is a existing link <a class="test" href="https://yahoo.com/">Yahoo</a> Text</p>
<!-- /wp:paragraph -->
<!-- wp:paragraph -->
<p>This is another Google Text</p>
<!-- /wp:paragraph -->
<!-- wp:paragraph -->
<p>This is another lowercase bing Text</p>
<!-- /wp:paragraph -->
<!-- wp:paragraph -->
<p>This is another multi word Active Campaign.</p>
<!-- /wp:paragraph --> 

########################################################################### 



################################### NEW ################################### 

<!-- wp:paragraph -->
<p>This is a existing link <a class="test" href="https://yahoo.com/">Yahoo</a> Text</p>
<!-- /wp:paragraph -->
<!-- wp:paragraph -->
<p>This is another <a href="https://www.google.com">Google</a> Text</p>
<!-- /wp:paragraph -->
<!-- wp:paragraph -->
<p>This is another lowercase <a href="https://bing.com">bing</a> Text</p>
<!-- /wp:paragraph -->
<!-- wp:paragraph -->
<p>This is another multi word <a href="https://activecampaign.com">Active Campaign</a>.</p>
<!-- /wp:paragraph --> 

###########################################################################

This should work even with multiple matches in the same string as long as they don’t overlap (like "Google Chrome" and "Chrome Beta")

Answered By: Driftr95
Categories: questions Tags: , ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.