BeautifulSoup - scraping paragraphs from html

Cerulean Fish

Code: Python

2021-07-24 12:43:13

from bs4 import BeautifulSoup

# Minimal HTML document used as the parsing fixture: an <h1>, a <p> with
# class "subtitle", a <p> with no class attribute, and a four-item <ul>.
SIMPLE_HTML = '''<html>
<head></head>
<body>
<h1>This is a title</h1>
<p class="subtitle">Lorem ipsum dolor sit amet.</p>
<p>Here's another p without a class</p>
<ul>
    <li>Sarah</li>
    <li>Mary</li>
    <li>Charlotte</li>
    <li>Carl</li>
</ul>
</body>
</html>'''

# Parse the fixture with the 'html.parser' backend; the functions below
# query this module-level soup object.
simple_soup = BeautifulSoup(SIMPLE_HTML, 'html.parser')

# Find paragraph
def find_paragraph():
    """Print the text of the first <p> tag carrying the 'subtitle' class.

    Looks the tag up in the module-level ``simple_soup``. If no such tag
    exists, ``find`` yields ``None`` and the ``.string`` access raises
    ``AttributeError``.
    """
    subtitle_tag = simple_soup.find('p', {'class': 'subtitle'})
    print(subtitle_tag.string)


def find_other_paragraph():
    """Print the text of the first <p> tag that lacks the 'subtitle' class.

    Scans every <p> in the module-level ``simple_soup`` and keeps those
    whose class list does not contain 'subtitle'. Raises ``IndexError``
    when no paragraph qualifies.
    """
    without_subtitle = []
    for tag in simple_soup.find_all('p'):
        # A tag without a class attribute has no 'class' key in attrs;
        # fall back to an empty list so the membership test still works.
        css_classes = tag.attrs.get('class', [])
        if 'subtitle' in css_classes:
            continue
        without_subtitle.append(tag)
    print(without_subtitle[0].string)
    
# Run both demos: print the 'subtitle' paragraph first, then the
# paragraph that has no 'subtitle' class.
find_paragraph() 
find_other_paragraph()

New to Communities?

Join the community

BeautifulSoup - scraping paragraphs from html

Tags

Related

New to Communities?