Title here
Summary here
re Module Functions
Regular expressions (regex) are a powerful language for describing text patterns. They let you:
Python’s re module provides full regular expression support. In this lesson, we’ll learn:
re module functions: search(), match(), findall(), sub()
Compare these approaches to validating an email:
# Without regex (incomplete and brittle)
def is_valid_email_simple(email):
    """Naive check: the address only has to contain both '@' and '.'."""
    required_chars = ('@', '.')
    return all(ch in email for ch in required_chars)
# With regex (more robust)
import re
def is_valid_email_regex(email):
    """Return True if *email* has a basic ``local@domain.tld`` shape.

    The whole string must match: one or more allowed local-part characters,
    an '@', a domain of letters/digits/dots/hyphens, and a final dot
    followed by at least two letters.
    """
    pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
    # fullmatch, not match: '$' alone also matches just before a trailing
    # newline, so re.match would accept "user@example.com\n".
    return re.fullmatch(pattern, email) is not None
# Test both
emails = ["user@example.com", "invalid@", "@example.com", "no-at-sign.com"]
for email in emails:
    simple = is_valid_email_simple(email)
    regex = is_valid_email_regex(email)
    # !s formats the bool as text; a bare width spec would print it as 1/0
    print(f"{email:25} Simple: {simple!s:5} Regex: {regex}")
Output:
user@example.com          Simple: True  Regex: True
invalid@                  Simple: False Regex: False
@example.com              Simple: True  Regex: False
no-at-sign.com            Simple: False Regex: False
Regex provides much better validation!
Let’s start with simple patterns:
import re
text = "The book costs $29.99"
# Literal match
print(re.search(r'book', text)) # Match object
print(re.search(r'magazine', text)) # None
# Case-sensitive by default
print(re.search(r'Book', text)) # None
print(re.search(r'Book', text, re.IGNORECASE)) # Match object
Special Characters:
. - any character (except newline)
^ - start of string
$ - end of string
* - 0 or more repetitions
+ - 1 or more repetitions
? - 0 or 1 repetition
\d - any digit (0-9)
\w - any word character (a-z, A-Z, 0-9, _)
\s - any whitespace character
import re
# . matches any character
print(re.findall(r'b..k', 'book back b@ek')) # ['book', 'back', 'b@ek']
# \d matches digits
print(re.findall(r'\d+', 'Book costs $29.99')) # ['29', '99']
# \w matches word characters
print(re.findall(r'\w+', 'Hello, World!')) # ['Hello', 'World']
# \s matches whitespace
print(re.findall(r'\s+', 'Hello World\t!')) # [' ', '\t']
re Module Functions
re.search() - Find First Match
import re
text = "ISBN: 978-0-13-110362-7"
# Find the first occurrence
match = re.search(r'\d{3}-\d-\d{2}-\d{6}-\d', text)
if match:
print(f"Found: {match.group()}")
print(f"Position: {match.start()} to {match.end()}")Output:
Found: 978-0-13-110362-7
Position: 6 to 23
re.match() - Match from Beginning
import re
# match() only checks the beginning
text = "Book: Python Basics"
print(re.match(r'Book', text)) # Match
print(re.match(r'Python', text)) # None (not at start)
print(re.search(r'Python', text)) # Match (searches anywhere)
re.findall() - Find All Matches
import re
text = "Books cost $29.99, $39.99, and $49.99"
# Find all prices
prices = re.findall(r'\$\d+\.\d{2}', text)
print(prices) # ['$29.99', '$39.99', '$49.99']
# Extract just the numbers
numbers = re.findall(r'\$(\d+\.\d{2})', text)
print(numbers) # ['29.99', '39.99', '49.99']
re.sub() - Replace Matches
import re
text = "Book costs $29.99"
# Replace prices with "PRICE"
result = re.sub(r'\$\d+\.\d{2}', 'PRICE', text)
print(result) # Book costs PRICE
# Use captured groups in replacement
text = "First: John, Last: Doe"
result = re.sub(r'First: (\w+), Last: (\w+)', r'\2, \1', text)
print(result) # Doe, John
re.split() - Split by Pattern
import re
text = "Python,Basics;Advanced|Expert"
# Split by multiple delimiters
parts = re.split(r'[,;|]', text)
print(parts) # ['Python', 'Basics', 'Advanced', 'Expert']import re
text = "Books: Python3, Java8, C++11"
# [abc] - any of a, b, or c
print(re.findall(r'[PJC]', text)) # ['P', 'J', 'C']
# [a-z] - any lowercase letter
print(re.findall(r'[a-z]+', text)) # ['ooks', 'ython', 'ava']
# [A-Z] - any uppercase letter
print(re.findall(r'[A-Z]', text)) # ['B', 'P', 'J', 'C']
# [A-Za-z] - any letter
print(re.findall(r'[A-Za-z]+', text)) # ['Books', 'Python', 'Java', 'C']
# [^0-9] - anything except digits (negation)
print(re.findall(r'[^0-9,: ]+', text)) # ['Books', 'Python', 'Java', 'C++']import re
text = "ISBN: 978-0-13-110362-7"
# {n} - exactly n times
print(re.findall(r'\d{3}', text)) # ['978', '110', '362']
# {n,m} - n to m times
print(re.findall(r'\d{1,2}', text)) # ['97', '8', '0', '13', '11', '03', '62', '7']
# {n,} - n or more times
print(re.findall(r'\d{3,}', text)) # ['978', '110362']
# * - 0 or more (same as {0,})
print(re.findall(r'\d*', "a1bb22ccc")) # ['', '1', '', '', '22', '', '', '', '']
# + - 1 or more (same as {1,})
print(re.findall(r'\d+', "a1bb22ccc")) # ['1', '22']
# ? - 0 or 1 (same as {0,1})
print(re.findall(r'colou?r', "color colour")) # ['color', 'colour']import re
text = "Book: Python Basics by John Doe, $29.99"
# Capturing groups with ()
pattern = r'Book: (.+) by (.+), \$(\d+\.\d{2})'
match = re.search(pattern, text)
if match:
print(f"Title: {match.group(1)}")
print(f"Author: {match.group(2)}")
print(f"Price: ${match.group(3)}")
print(f"Full match: {match.group(0)}")Output:
Title: Python Basics
Author: John Doe
Price: $29.99
Full match: Book: Python Basics by John Doe, $29.99import re
text = "ISBN: 978-0-13-110362-7"
pattern = r'ISBN: (?P<prefix>\d{3})-(?P<group>\d)-(?P<publisher>\d{2})-(?P<title>\d{6})-(?P<check>\d)'
match = re.search(pattern, text)
if match:
print(f"Prefix: {match.group('prefix')}")
print(f"Group: {match.group('group')}")
print(f"Publisher: {match.group('publisher')}")
print(f"Title: {match.group('title')}")
print(f"Check: {match.group('check')}")Output:
Prefix: 978
Group: 0
Publisher: 13
Title: 110362
Check: 7import re
def validate_isbn(isbn):
    """Classify *isbn* as ISBN-10 or ISBN-13 by its shape.

    Hyphens and whitespace are stripped first. Returns ``(kind, cleaned)``
    where kind is "ISBN-10" or "ISBN-13", or ``(None, None)`` when the
    cleaned string matches neither format. Only the format is checked;
    the check digit is not verified.
    """
    cleaned = re.sub(r'[-\s]', '', isbn)
    formats = (
        ("ISBN-10", r'^\d{9}[\dX]$'),  # 10 digits, or 9 digits + X
        ("ISBN-13", r'^\d{13}$'),      # 13 digits
    )
    for label, shape in formats:
        if re.match(shape, cleaned) is not None:
            return (label, cleaned)
    return (None, None)
# Test ISBNs
isbns = [
"978-0-13-110362-7",
"0-13-110362-7",
"978-0-13-110362-X", # Invalid
"123", # Too short
]
for isbn in isbns:
isbn_type, clean = validate_isbn(isbn)
if isbn_type:
print(f"{isbn:25} → Valid {isbn_type}: {clean}")
else:
print(f"{isbn:25} → Invalid")Output:
978-0-13-110362-7 → Valid ISBN-13: 9780131103627
0-13-110362-7 → Valid ISBN-10: 0131103627
978-0-13-110362-X → Invalid
123 → Invalidimport re
catalog_text = """
Title: Python Crash Course, Author: Eric Matthes, Price: $39.99, Year: 2019
Title: Automate the Boring Stuff, Author: Al Sweigart, Price: $29.99, Year: 2020
Title: Learning Python, Author: Mark Lutz, Price: $64.99, Year: 2013
"""
pattern = r'Title: ([^,]+), Author: ([^,]+), Price: \$(\d+\.\d{2}), Year: (\d{4})'
books = []
for match in re.finditer(pattern, catalog_text):
book = {
'title': match.group(1),
'author': match.group(2),
'price': float(match.group(3)),
'year': int(match.group(4))
}
books.append(book)
# Display extracted books
for book in books:
print(f"{book['title']} by {book['author']}")
print(f" ${book['price']} ({book['year']})")Output:
Python Crash Course by Eric Matthes
$39.99 (2019)
Automate the Boring Stuff by Al Sweigart
$29.99 (2020)
Learning Python by Mark Lutz
$64.99 (2013)import re
def clean_book_title(title):
    """Clean and normalize a book title.

    Removes every character that is not a word character, whitespace,
    colon, comma, hyphen or apostrophe; collapses runs of whitespace to a
    single space; strips the ends; and title-cases the result.
    """
    # Strip disallowed characters first: removing e.g. the '&' in
    # "Python & Java" leaves two adjacent spaces that must still be collapsed.
    title = re.sub(r'[^\w\s:,\-\']', '', title)
    # Collapse whitespace runs (including any created by the removal above)
    title = re.sub(r'\s+', ' ', title)
    # NOTE: str.title() also capitalizes the letter after an apostrophe
    # (e.g. "it's" -> "It'S").
    return title.strip().title()
# Test with messy titles
messy_titles = [
" python basics ",
"Data@Science#101",
"LEARNING MACHINE LEARNING!!!",
"The Complete Python Course",
]
print("Cleaned Titles:")
for title in messy_titles:
clean = clean_book_title(title)
print(f"{title:45} → {clean}")Output:
Cleaned Titles:
python basics → Python Basics
Data@Science#101 → Datascience101
LEARNING MACHINE LEARNING!!! → Learning Machine Learning
The Complete Python Course → The Complete Python Courseimport re
def extract_prices(text):
    """Extract prices in various formats.

    Recognizes ``$29.99`` / ``$29``, ``€29.99`` and ``29.99 dollars`` /
    ``29.99 USD`` (case-insensitive). Returns the amounts as floats in the
    order they appear in *text*; an empty list if none are found.
    """
    amount = r'(\d+(?:\.\d{2})?)'
    # One alternation scanned left to right keeps results in text order and
    # counts an amount such as "$10 USD" once rather than once per format.
    price_re = re.compile(
        rf'\${amount}'                     # $29.99 or $29
        rf'|€{amount}'                     # €29.99
        rf'|{amount}\s*(?:dollars|USD)',   # 29.99 dollars / 29.99 USD
        re.IGNORECASE,
    )
    # Exactly one group participates per match; lastindex is its number.
    return [float(m.group(m.lastindex)) for m in price_re.finditer(text)]
text = """
Books on sale:
- Python Basics: $29.99
- Java Programming: 39.99 USD
- Data Science: €44.99
- Web Development: $59
"""
prices = extract_prices(text)
print("Extracted prices:", prices)
print(f"Total: ${sum(prices):.2f}")
print(f"Average: ${sum(prices)/len(prices):.2f}")Output:
Extracted prices: [29.99, 39.99, 44.99, 59.0]
Total: $173.97
Average: $43.49
Advanced patterns for matching based on context:
import re
text = "Python3 Java8 C++11"
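# Positive lookahead: match word characters followed by a digit
# (\w also matches digits, so the first '1' of '11' qualifies too)
print(re.findall(r'\w+(?=\d)', text)) # ['Python', 'Java', '1']
# Negative lookahead: match word characters NOT followed by a digit
# (greedy \w+ swallows the trailing digits, so whole tokens match)
print(re.findall(r'\w+(?!\d)', text)) # ['Python3', 'Java8', 'C', '11']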
# Positive lookbehind: match digit preceded by letter
print(re.findall(r'(?<=[a-zA-Z])\d+', text)) # ['3', '8']
# Negative lookbehind: match digit NOT preceded by letter
print(re.findall(r'(?<![a-zA-Z])\d+', text)) # ['11']import re
text = "Python PYTHON python PyThOn"
# Case-insensitive
print(re.findall(r'python', text, re.IGNORECASE))
# ['Python', 'PYTHON', 'python', 'PyThOn']
# Multiline mode (^ and $ match line boundaries)
multiline_text = """Book 1: Python
Book 2: Java
Book 3: C++"""
print(re.findall(r'^Book \d', multiline_text, re.MULTILINE))
# ['Book 1', 'Book 2', 'Book 3']
# Verbose mode (allows comments and whitespace)
pattern = re.compile(r"""
\$ # Dollar sign
(\d+) # Dollars
\. # Decimal point
(\d{2}) # Cents
""", re.VERBOSE)
print(pattern.findall("$29.99 and $39.99"))
# [('29', '99'), ('39', '99')]import re
# Email validation
email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
# Phone number (US)
phone_pattern = r'^\(?(\d{3})\)?[-.\s]?(\d{3})[-.\s]?(\d{4})$'
# URL
url_pattern = r'https?://(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&/=]*)'
# ISBN-13
isbn13_pattern = r'^\d{3}-?\d{1,5}-?\d{1,7}-?\d{1,7}-?\d$'
# Credit card (simple)
cc_pattern = r'^\d{4}-?\d{4}-?\d{4}-?\d{4}$'
# Test examples
test_data = {
'email': 'user@example.com',
'phone': '(555) 123-4567',
'url': 'https://www.example.com/path',
'isbn': '978-0-13-110362-7',
'cc': '1234-5678-9012-3456'
}
patterns = {
'email': email_pattern,
'phone': phone_pattern,
'url': url_pattern,
'isbn': isbn13_pattern,
'cc': cc_pattern
}
for data_type, value in test_data.items():
pattern = patterns[data_type]
if re.match(pattern, value):
print(f"✓ Valid {data_type}: {value}")
else:
print(f"✗ Invalid {data_type}: {value}")Output:
✓ Valid email: user@example.com
✓ Valid phone: (555) 123-4567
✓ Valid url: https://www.example.com/path
✓ Valid isbn: 978-0-13-110362-7
✓ Valid cc: 1234-5678-9012-3456
In this lesson, we learned about regular expressions:
- Special characters (., ^, $, etc.)
- Character classes: \d (digits), \w (word chars), \s (whitespace)
- Quantifiers: *, +, ?, {n}, {n,m}
- Functions: search(), match(), findall(), sub(), split()
- Groups: capturing groups with () and named groups with (?P<name>...)
- Flags: re.IGNORECASE, re.MULTILINE, re.VERBOSE
Common use cases:
Best practices:
- Use raw strings (r'...') for regex patterns
- Compile patterns with re.compile() if used repeatedly
In the next lesson, we’ll explore Python’s advanced collection types from the collections module.