Is there a better way to get the data from a Beautiful Soup query?

Question:

I am trying to extract the m/z data for different ions from "https://www.lipidmaps.org/databases/lmsd/LMFA08040013".
I can access the ions and their data; however, to extract the formula and m/z, I am thinking of converting each option to a string and then using string-stripping tools to extract them. Is there another way to do this using BeautifulSoup?


from bs4 import BeautifulSoup  # parses the HTML text returned by requests
import requests
# Download the page and parse it with Python's built-in "html.parser" backend.
soup = BeautifulSoup(requests.get("https://www.lipidmaps.org/databases/lmsd/LMFA08040013").text, "html.parser")

# Visit every <option> tag found anywhere in the parsed page.
for option in soup.find_all('option'):
    ion = option.text  # the tag's text content, e.g. "[M+H]+" with surrounding whitespace
    option = str(option)  # rebinds the loop variable to the tag's HTML markup as a string
    m_z = ion  # placeholder: currently just copies the ion label
    # NOTE(review): the next two lines are unfinished in the question: the
    # assignment has no right-hand side, and the `return` sits outside any
    # function, has an unclosed parenthesis, and `ion-formula` parses as a
    # subtraction rather than a name.
    ion_formula = 
    return ([m_z,ion-formula,ion]

example of option data:

<option data-display-formula="C&lt;sub&gt;18&lt;/sub&gt;H&lt;sub&gt;38&lt;/sub&gt;NO&lt;sub&gt;2&lt;/sub&gt;" data-formula="C18H38NO2" data-mass-z-ratio="300.2897" value="MplusH">
                                    [M+H]+
                                </option>

Example of output data:


m_z = 300.2897
ion-formula = C18H38NO2
ion = [M+H]+
Asked By: Nima Hojat

||

Answers:

Not sure what you mean by more elegant but if all you want is the first option with the given ion value you can get the output you want this way:

import requests
from bs4 import BeautifulSoup

url = "https://www.lipidmaps.org/databases/lmsd/LMFA08040013"

# Fail fast on HTTP errors or a hung connection instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

# The colon inside the class name must be backslash-escaped: left unescaped,
# the CSS parser reads ":calculate-mz" as a pseudo-class and rejects the
# selector. A raw string keeps the backslash intact.
# nth-child(2) takes the second <option>; presumably the first child is a
# placeholder entry without ion data.
option = BeautifulSoup(response.text, "lxml").select_one(
    r".change\:calculate-mz > option:nth-child(2)"
)
if option is None:
    # select_one returns None when nothing matches; report that clearly
    # rather than failing later with a TypeError on subscripting.
    raise SystemExit("No matching <option> found; the page layout may have changed.")

# The values of interest are plain attributes on the tag; no string slicing needed.
mz = option["data-mass-z-ratio"]
formula = option["data-formula"]
ion = option.get_text(strip=True)  # strip the whitespace surrounding the label

print(f"{mz} {formula} {ion}")

Output:

300.2897 C18H38NO2 [M+H]+

To list them all, try this:

import requests
from bs4 import BeautifulSoup

url = "https://www.lipidmaps.org/databases/lmsd/LMFA08040013"

# Fail fast on HTTP errors or a hung connection instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

# The colon inside the class name must be backslash-escaped: left unescaped,
# the CSS parser reads ":calculate-mz" as a pseudo-class and rejects the
# selector. A raw string keeps the backslash intact.
# [1:] drops the first <option>; presumably it is a placeholder entry that
# lacks the data-* attributes read below.
options = BeautifulSoup(response.text, "lxml").select(
    r".change\:calculate-mz > option"
)[1:]

for option in options:
    # Each value is a plain attribute on the tag; no string slicing needed.
    mz = option["data-mass-z-ratio"]
    formula = option["data-formula"]
    ion = option.get_text(strip=True)  # strip the whitespace surrounding the label
    # "\n" puts each field on its own line, matching the listing shown below.
    print(f"m_z = {mz}\nion-formula = {formula}\nion = {ion}")
    print("-" * 30)

Output:

m_z = 300.2897
ion-formula = C18H38NO2
ion = [M+H]+
------------------------------
m_z = 282.2791
ion-formula = C18H36NO
ion = [M+H-H2O]+
------------------------------
m_z = 299.2819
ion-formula = C18H37NO2
ion = [M.]+
------------------------------
m_z = 150.6485
ion-formula = C18H39NO2
ion = [M+2H]2+
------------------------------
m_z = 100.7681
ion-formula = C18H40NO2
ion = [M+3H]3+
------------------------------
m_z = 75.8279
ion-formula = C18H41NO2
ion = [M+4H]4+
------------------------------
m_z = 338.2456
ion-formula = C18H37KNO2
ion = [M+K]+
------------------------------
m_z = 188.6044
ion-formula = C18H37K2NO2
ion = [M+2K]2+
------------------------------
m_z = 376.2015
ion-formula = C18H36K2NO2
ion = [M+2K-H]+
------------------------------
m_z = 322.2716
ion-formula = C18H37NNaO2
ion = [M+Na]+
------------------------------
m_z = 172.6304
ion-formula = C18H37NNa2O2
ion = [M+2Na]2+
------------------------------
m_z = 344.2536
ion-formula = C18H36NNa2O2
ion = [M+2Na-H]+
------------------------------
m_z = 306.2979
ion-formula = C18H37LiNO2
ion = [M+Li]+
------------------------------
m_z = 156.6567
ion-formula = C18H37Li2NO2
ion = [M+2Li]2+
------------------------------
m_z = 317.3162
ion-formula = C18H41N2O2
ion = [M+NH4]+
------------------------------
m_z = 298.2752
ion-formula = C18H36NO2
ion = [M-H]-
------------------------------
m_z = 148.6339
ion-formula = C18H35NO2
ion = [M-2H]2-
------------------------------
m_z = 98.7535
ion-formula = C18H34NO2
ion = [M-3H]3-
------------------------------
m_z = 73.8133
ion-formula = C18H33NO2
ion = [M-4H]4-
------------------------------
m_z = 334.2518
ion-formula = C18H37ClNO2
ion = [M+Cl]-
------------------------------
m_z = 358.2963
ion-formula = C20H40NO4
ion = [M+OAc]-
------------------------------
m_z = 344.2806
ion-formula = C19H38NO4
ion = [M+HCOO]-
------------------------------
m_z = 299.2824
ion-formula = C18H37NO2
ion = M(neutral)
------------------------------
Answered By: baduker

I can’t say what could be considered elegant, but I usually get data like this using .get and list comprehension

# Build one record per <option> that has both data attributes; the attribute
# selector leaves out any <option> missing either of them.
ionOptions = []
for tag in soup.select('option[data-mass-z-ratio][data-formula]'):
    label = tag.get_text(' ').strip()
    ionOptions.append({
        'ion': label,
        'ion-formula': tag.get('data-formula'),
        'm_z': tag.get('data-mass-z-ratio'),
    })

and ionOptions would look like

[{'ion': '[M+H]+', 'ion-formula': 'C18H38NO2', 'm_z': '300.2897'},
 {'ion': '[M+H-H2O]+', 'ion-formula': 'C18H36NO', 'm_z': '282.2791'},
 {'ion': '[M.]+', 'ion-formula': 'C18H37NO2', 'm_z': '299.2819'},
 {'ion': '[M+2H]2+', 'ion-formula': 'C18H39NO2', 'm_z': '150.6485'},
 {'ion': '[M+3H]3+', 'ion-formula': 'C18H40NO2', 'm_z': '100.7681'},
 {'ion': '[M+4H]4+', 'ion-formula': 'C18H41NO2', 'm_z': '75.8279'},
 {'ion': '[M+K]+', 'ion-formula': 'C18H37KNO2', 'm_z': '338.2456'},
 {'ion': '[M+2K]2+', 'ion-formula': 'C18H37K2NO2', 'm_z': '188.6044'},
 {'ion': '[M+2K-H]+', 'ion-formula': 'C18H36K2NO2', 'm_z': '376.2015'},
 {'ion': '[M+Na]+', 'ion-formula': 'C18H37NNaO2', 'm_z': '322.2716'},
 {'ion': '[M+2Na]2+', 'ion-formula': 'C18H37NNa2O2', 'm_z': '172.6304'},
 {'ion': '[M+2Na-H]+', 'ion-formula': 'C18H36NNa2O2', 'm_z': '344.2536'},
 {'ion': '[M+Li]+', 'ion-formula': 'C18H37LiNO2', 'm_z': '306.2979'},
 {'ion': '[M+2Li]2+', 'ion-formula': 'C18H37Li2NO2', 'm_z': '156.6567'},
 {'ion': '[M+NH4]+', 'ion-formula': 'C18H41N2O2', 'm_z': '317.3162'},
 {'ion': '[M-H]-', 'ion-formula': 'C18H36NO2', 'm_z': '298.2752'},
 {'ion': '[M-2H]2-', 'ion-formula': 'C18H35NO2', 'm_z': '148.6339'},
 {'ion': '[M-3H]3-', 'ion-formula': 'C18H34NO2', 'm_z': '98.7535'},
 {'ion': '[M-4H]4-', 'ion-formula': 'C18H33NO2', 'm_z': '73.8133'},
 {'ion': '[M+Cl]-', 'ion-formula': 'C18H37ClNO2', 'm_z': '334.2518'},
 {'ion': '[M+OAc]-', 'ion-formula': 'C20H40NO4', 'm_z': '358.2963'},
 {'ion': '[M+HCOO]-', 'ion-formula': 'C19H38NO4', 'm_z': '344.2806'},
 {'ion': 'M(neutral)', 'ion-formula': 'C18H37NO2', 'm_z': '299.2824'}]

Note: You can use .find_all('option') or .select('option') instead of .select('option[data-mass-z-ratio][data-formula]'), but then the first option tag would also get included:

{'ion': '(Select m/z)', 'ion-formula': None, 'm_z': None}

You could print it in sections

# Print each record as right-aligned "key = value" lines followed by a rule.
for record in ionOptions:
    for key, value in record.items():
        print(f'{key:>15} = {value}')
    print('-' * 40)

output:

            ion = [M+H]+
    ion-formula = C18H38NO2
            m_z = 300.2897
----------------------------------------
            ion = [M+H-H2O]+
    ion-formula = C18H36NO
            m_z = 282.2791
----------------------------------------
            ion = [M.]+
    ion-formula = C18H37NO2
            m_z = 299.2819
----------------------------------------
            ion = [M+2H]2+
    ion-formula = C18H39NO2
            m_z = 150.6485
----------------------------------------
            ion = [M+3H]3+
    ion-formula = C18H40NO2
            m_z = 100.7681
----------------------------------------
            ion = [M+4H]4+
    ion-formula = C18H41NO2
            m_z = 75.8279
----------------------------------------
            ion = [M+K]+
    ion-formula = C18H37KNO2
            m_z = 338.2456
----------------------------------------
            ion = [M+2K]2+
    ion-formula = C18H37K2NO2
            m_z = 188.6044
----------------------------------------
            ion = [M+2K-H]+
    ion-formula = C18H36K2NO2
            m_z = 376.2015
----------------------------------------
            ion = [M+Na]+
    ion-formula = C18H37NNaO2
            m_z = 322.2716
----------------------------------------
            ion = [M+2Na]2+
    ion-formula = C18H37NNa2O2
            m_z = 172.6304
----------------------------------------
            ion = [M+2Na-H]+
    ion-formula = C18H36NNa2O2
            m_z = 344.2536
----------------------------------------
            ion = [M+Li]+
    ion-formula = C18H37LiNO2
            m_z = 306.2979
----------------------------------------
            ion = [M+2Li]2+
    ion-formula = C18H37Li2NO2
            m_z = 156.6567
----------------------------------------
            ion = [M+NH4]+
    ion-formula = C18H41N2O2
            m_z = 317.3162
----------------------------------------
            ion = [M-H]-
    ion-formula = C18H36NO2
            m_z = 298.2752
----------------------------------------
            ion = [M-2H]2-
    ion-formula = C18H35NO2
            m_z = 148.6339
----------------------------------------
            ion = [M-3H]3-
    ion-formula = C18H34NO2
            m_z = 98.7535
----------------------------------------
            ion = [M-4H]4-
    ion-formula = C18H33NO2
            m_z = 73.8133
----------------------------------------
            ion = [M+Cl]-
    ion-formula = C18H37ClNO2
            m_z = 334.2518
----------------------------------------
            ion = [M+OAc]-
    ion-formula = C20H40NO4
            m_z = 358.2963
----------------------------------------
            ion = [M+HCOO]-
    ion-formula = C19H38NO4
            m_z = 344.2806
----------------------------------------
            ion = M(neutral)
    ion-formula = C18H37NO2
            m_z = 299.2824
----------------------------------------

I personally prefer to use pandas to get a tabular format:

import pandas  # DataFrame.to_markdown also needs the optional `tabulate` package

# Turn the list of dicts into a table and print it as Markdown without the row index.
print(pandas.DataFrame(ionOptions).to_markdown(index=False))

output:

| ion        | ion-formula   |      m_z |
|:-----------|:--------------|---------:|
| [M+H]+     | C18H38NO2     | 300.29   |
| [M+H-H2O]+ | C18H36NO      | 282.279  |
| [M.]+      | C18H37NO2     | 299.282  |
| [M+2H]2+   | C18H39NO2     | 150.649  |
| [M+3H]3+   | C18H40NO2     | 100.768  |
| [M+4H]4+   | C18H41NO2     |  75.8279 |
| [M+K]+     | C18H37KNO2    | 338.246  |
| [M+2K]2+   | C18H37K2NO2   | 188.604  |
| [M+2K-H]+  | C18H36K2NO2   | 376.202  |
| [M+Na]+    | C18H37NNaO2   | 322.272  |
| [M+2Na]2+  | C18H37NNa2O2  | 172.63   |
| [M+2Na-H]+ | C18H36NNa2O2  | 344.254  |
| [M+Li]+    | C18H37LiNO2   | 306.298  |
| [M+2Li]2+  | C18H37Li2NO2  | 156.657  |
| [M+NH4]+   | C18H41N2O2    | 317.316  |
| [M-H]-     | C18H36NO2     | 298.275  |
| [M-2H]2-   | C18H35NO2     | 148.634  |
| [M-3H]3-   | C18H34NO2     |  98.7535 |
| [M-4H]4-   | C18H33NO2     |  73.8133 |
| [M+Cl]-    | C18H37ClNO2   | 334.252  |
| [M+OAc]-   | C20H40NO4     | 358.296  |
| [M+HCOO]-  | C19H38NO4     | 344.281  |
| M(neutral) | C18H37NO2     | 299.282  |
Answered By: Driftr95
Categories: questions Tags: ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.