Web Scraping with Python Cheat Sheet

Install

$ pip install requests beautifulsoup4

BeautifulSoup on Text

from bs4 import BeautifulSoup

text = '''<div><h1>My Header</h1></div>'''

soup = BeautifulSoup(text, 'html.parser')
print(soup.prettify())

<div>
 <h1>
  My Header
 </h1>
</div>

Fetch Webpage and Create Soup

import requests
from bs4 import BeautifulSoup

url = 'https://devbyexample.com/test-scraping'
r = requests.get(url)

soup = BeautifulSoup(r.text, 'html.parser')

Find By ID

<h1 id="article-title">Hello Everyone</h1>

header = soup.find(id="article-title")
print(header)

<h1 id="article-title">Hello Everyone</h1>

print(header.string)

Hello Everyone

Find By Class

<div id="articles">
    <div class='article'>...</div>
    <div class='article'>...</div>
    <div class='article'>...</div>
    <div class='article'>...</div>
    <div class='end'><button>Next Page</button></div>
</div>

articles = soup.select('.article')
print(articles)

[   <div class="article">...</div>,
    <div class="article">...</div>,
    <div class="article">...</div>,
    <div class="article">...</div>]

Navigating Elements in Tree

<ul>
    <li><a href="https://google.com">Google</a></li>
    <li><a href="https://bing.com">Bing</a></li>
    <li><a href="https://apple.com">Apple</a></li>
</ul>

# Get First Link
print(soup.a)

<a href="https://google.com">Google</a>

# Get all Link elements on page
print(soup.find_all("a"))

[   <a href="https://google.com">Google</a>,
    <a href="https://bing.com">Bing</a>,
    <a href="https://apple.com">Apple</a>]

# Print all hrefs on page
for link in soup.find_all("a"):
    print(link['href'])

https://google.com
https://bing.com
https://apple.com

Element Attributes

<div id="article-10" class="article">
    <h3>Header</h3>
    <p>First Paragraph</p>
    <p>Second Paragraph</p>
</div>

print(soup.div.name)

div

print(soup.div.contents)

[   '\n', 
    <h3>Header</h3>, 
    '\n', 
    <p>First Paragraph</p>, 
    '\n', 
    <p>Second Paragraph</p>, 
    '\n']

for strings in soup.div.strings:
    print(repr(strings))

'\n'
'Header'
'\n'
'First Paragraph'
'\n'
'Second Paragraph'
'\n'

for strings in soup.div.stripped_strings:
    print(repr(strings))

'Header'
'First Paragraph'
'Second Paragraph'

Find By Regex

<div>
    <head><title>Sample Title</title></head>
    <h1>Title Header</h1> 
    <hr>
    <div>A description of something</div>
    <h2>Section Header</h2>
    <p>...</p>
    <h2>Another Header</h2>
    <p>...</p>
</div>

import re

headers = soup.find_all(re.compile('^h[1-6]'))
print(headers)

[   <h1>Title Header</h1>, 
    <h2>Section Header</h2>, 
    <h2>Another Header</h2>]

Search with CSS Select

<div>
    <h3><a href="/sites">Sites</a></h3>
    <ul class="site-list">
        <li><a href="https://google.com">Google</a></li>
        <li><a href="https://bing.com">Bing</a></li>
        <li><a href="https://apple.com">Apple</a></li>
    </ul>
</div>

print(soup.select('div a'))

[   <a href="/sites">Sites</a>,
    <a href="https://google.com">Google</a>,
    <a href="https://bing.com">Bing</a>,
    <a href="https://apple.com">Apple</a>]

print(soup.select('div > h3 > a'))

[<a href="/sites">Sites</a>]

print(soup.select('li:nth-child(odd)'))

[   <li><a href="https://google.com">Google</a></li>,
    <li><a href="https://apple.com">Apple</a></li>]

print(soup.select('a[href*="http"]'))

[   <a href="https://google.com">Google</a>,
    <a href="https://bing.com">Bing</a>,
    <a href="https://apple.com">Apple</a>]

Parent, Children and Siblings

<div>
    <ul>
        <li><a href="https://google.com">Google</a></li>
        <li><a href="https://bing.com">Bing</a></li>
        <li><a href="https://apple.com">Apple</a></li>
    </ul>
</div>

# Get Parent Name
ul_element = soup.find('ul')
print(ul_element.parent.name)

div

# Print all text in children (whitespace-only text nodes between the <li> tags are omitted from the output below)
for child in ul_element.children:
    print(child.string)

Google
Bing
Apple

# Siblings (whitespace-only text nodes between the <li> tags are omitted from the output below)
first_li_element = soup.find('li')
print(first_li_element)
for sibling in first_li_element.next_siblings:
    print(sibling)

<li><a href="https://google.com">Google</a></li>
<li><a href="https://bing.com">Bing</a></li>
<li><a href="https://apple.com">Apple</a></li>

A Python Web Scraping How-To Guide

Install

BeautifulSoup on Text

Fetch Webpage and Create Soup

Find By ID

Find By Class

Navigating Elements in Tree

Element Attributes

Find By Regex

Search with CSS Select

Parent, Children and Siblings