$ pip install requests beautifulsoup4
# Parse an in-memory HTML string and pretty-print the resulting tree.
from bs4 import BeautifulSoup
text = '''<div><h1>My Header</h1></div>'''
soup = BeautifulSoup(text, 'html.parser')  # html.parser = stdlib backend, no extra install
print(soup.prettify())
<div>
<h1>
My Header
</h1>
</div>
# Fetch a live page over HTTP and parse its body.
import requests
from bs4 import BeautifulSoup
url = 'https://devbyexample.com/test-scraping'
r = requests.get(url)  # NOTE(review): no timeout or status check — acceptable for a demo
soup = BeautifulSoup(r.text, 'html.parser')
<h1 id="article-title">Hello Everyone</h1>
# Look up the element by its id attribute. The markup above declares
# id="article-title" (not "article-id"); with the wrong id, find()
# returns None and the .string access below would raise AttributeError.
header = soup.find(id="article-title")
print(header)
<h1 id="article-title">Hello Everyone</h1>
print(header.string)
Hello Everyone
<div id="articles">
<div class='article'>...</div>
<div class='article'>...</div>
<div class='article'>...</div>
<div class='article'>...</div>
<div class='end'><button>Next Page</button></div>
</div>
# CSS class selector: every element with class="article"
articles = soup.select('.article')
print(articles)
[ <div class="article">...</div>,
<div class="article">...</div>,
<div class="article">...</div>,
<div class="article">...</div>]
<ul>
<li><a href="https://google.com">Google</a></li>
<li><a href="https://bing.com">Bing</a></li>
<li><a href="https://apple.com">Apple</a></li>
</ul>
# Get First Link
print(soup.a)  # tag-name attribute access returns the first matching tag only
<a href="https://google.com">Google</a>
# Get all Link elements on page
print(soup.find_all("a"))  # find_all returns a list of every matching tag
[ <a href="https://google.com">Google</a>,
<a href="https://bing.com">Bing</a>,
<a href="https://apple.com">Apple</a>]
# Print all hrefs on page
# (restored the loop-body indentation, which was missing in the original).
for link in soup.find_all("a"):
    print(link['href'])  # tag indexing reads the attribute; raises KeyError if absent
https://google.com
https://bing.com
https://apple.com
<div id="article-10" class="article">
<h3>Header</h3>
<p>First Paragraph</p>
<p>Second Paragraph</p>
</div>
print(soup.div.name)
div
print(soup.div.contents)
[ '\n',
<h3>Header</h3>,
'\n',
<p>First Paragraph</p>,
'\n',
<p>Second Paragraph</p>,
'\n']
# Iterate every text node under the first <div>, whitespace included.
# Fix: the original iterated over an undefined name `div`; use soup.div,
# matching the stripped_strings example that follows.
for strings in soup.div.strings:
    print(repr(strings))
'\n'
'Header'
'\n'
'First Paragraph'
'\n'
'Second Paragraph'
'\n'
# stripped_strings skips whitespace-only nodes and trims the rest
# (restored the loop-body indentation, which was missing in the original).
for strings in soup.div.stripped_strings:
    print(repr(strings))
'Header'
'First Paragraph'
'Second Paragraph'
<div>
<head><title>Sample Title</title></head>
<h1>Title Header</h1>
<hr>
<div>A description of something</div>
<h2>Section Header</h2>
<p>...</p>
<h2>Another Header</h2>
<p>...</p>
</div>
import re
# A compiled regex passed to find_all is matched against tag *names*:
# here, any element whose name starts with h1 through h6.
headers = soup.find_all(re.compile('^h[1-6]'))
print(headers)
[ <h1>Title Header</h1>,
<h2>Section Header</h2>,
<h2>Another Header</h2>]
<div>
<h3><a href="/sites">Sites</a></h3>
<ul class="site-list">
<li><a href="https://google.com">Google</a></li>
<li><a href="https://bing.com">Bing</a></li>
<li><a href="https://apple.com">Apple</a></li>
</ul>
</div>
print(soup.select('div a'))
[ <a href="/sites">Sites</a>,
<a href="https://google.com">Google</a>,
<a href="https://bing.com">Bing</a>,
<a href="https://apple.com">Apple</a>]
print(soup.select('div > h3 > a'))
[<a href="/sites">Sites</a>]
print(soup.select('li:nth-child(odd)'))
[ <li><a href="https://google.com">Google</a></li>,
<li><a href="https://apple.com">Apple</a></li>]
print(soup.select('a[href*="http"]'))
[ <a href="https://google.com">Google</a>,
<a href="https://bing.com">Bing</a>,
<a href="https://apple.com">Apple</a>]
<div>
<ul>
<li><a href="https://google.com">Google</a></li>
<li><a href="https://bing.com">Bing</a></li>
<li><a href="https://apple.com">Apple</a></li>
</ul>
</div>
# Get Parent Name
ul_element = soup.find('ul')
print(ul_element.parent.name)  # the tag that directly encloses the <ul>
div
# Print all text in children
# (restored the loop-body indentation, which was missing in the original).
for child in ul_element.children:
    print(child.string)  # each <li> wraps a single text node
Google
Bing
Apple
# Siblings
first_li_element = soup.find('li')
print(first_li_element)
# next_siblings walks the remaining nodes at the same tree level
# (restored the loop-body indentation, which was missing in the original).
for sibling in first_li_element.next_siblings:
    print(sibling)
<li><a href="https://google.com">Google</a></li>
<li><a href="https://bing.com">Bing</a></li>
<li><a href="https://apple.com">Apple</a></li>