import re
Regular expressions RE
Regular expression in Python
Regular Expressions Cheat-Sheets
Check out the following links for more information:
https://docs.python.org/3/howto/regex.html
https://docs.python.org/3/library/re.html
https://docs.python.org/3/howto/regex.html#greedy-versus-non-greedy
Shout out to regex101.com, which will explain each stage of a regex.
Summary
^
Beginning character
$
End character
Putting a backslash (\) before a special character matches that character literally
\w
matches letters number and underscores
\d
matches digits
\s
for whitespace characters, space tab or newline
\b
for word boundaries
[a-z]
is all lowercase letters
[A-Z]
is uppercase letters
[0-9]
is numbers
.
is a wildcard: it matches any single character (except a newline)
*
means zero or more repetitions of the character (or class) before it
?
zero or one occurrence of the character before it
^
as the first character inside square brackets: NOT, i.e. match any character that is not in the character class
|
Or statement
[]
within square brackets are or statements
{n,m}
numeric repetition qualifiers with brackets between n and m. Remove n or m can do less or more
Different commands
re.search
finds first instance
re.findall
finds all instances
re.split
split based on the expression
re.sub
substitute a part of the string
# Same pattern, same input: search() reports only the first hit as a Match
# object, findall() returns every non-overlapping hit as a list of strings.
print('Using>> re.search(r"ba","babar") gives:\n',
      re.search(r"ba", "babar"))
print('Using>> re.findall(r"ba","babar") gives:\n',
      re.findall(r"ba", "babar"))
Using>> re.search(r"ba","babar") gives:
<re.Match object; span=(0, 2), match='ba'>
Using>> re.findall(r"ba","babar") gives:
['ba', 'ba']
print(re.split(r"[.?!]","the dog! is here. whhere? oh I see."))
['the dog', ' is here', ' whhere', ' oh I see', '']
print(re.sub(r"dog","cat","the dog! is here. whhere? oh I see."))
the cat! is here. whhere? oh I see.
Search within a string
# A plain pattern is found anywhere inside the searched string.
print(' Find a string within a string')
result = re.search(r"aza", "bazaar")
print(result)
result = re.search(r"aza", "plaza")
print(result)

# ^ pins the match to the very start of the string, so "plaza" gives None.
print('\n At start of the string')
result = re.search(r"^z", "zebra")
print(result)
result = re.search(r"^z", "plaza")
print(result)

# Each . stands in for exactly one arbitrary character.
print('\n The joker .')
result = re.search(r"x.n", "xenon")
print(result)
result = re.search(r"x..o", "xenon")
print(result)
Find a string within a string
<re.Match object; span=(1, 4), match='aza'>
<re.Match object; span=(2, 5), match='aza'>
At start of the string
<re.Match object; span=(0, 1), match='z'>
None
The joker .
<re.Match object; span=(0, 3), match='xen'>
<re.Match object; span=(0, 4), match='xeno'>
Character classes
These are inside square brackets and are OR statements
[a-z] is all lowercase letters, [A-Z] is uppercase letters, [0-9] is digits
print(re.search(r"[Ppc]ython","cython"))
print(re.search(r"[a-z]ython","dython"))
print(re.search(r"[a-z]way","My way"))
print(re.search(r"[a-z]way","Myway"))
#find cloud with letter or number after it
print(re.search(r"cloud[a-zA-Z0-9]","cloud9"))
print(re.search(r"[a-zA-Z0-9]","dy9thon"))
<re.Match object; span=(0, 6), match='cython'>
<re.Match object; span=(0, 6), match='dython'>
None
<re.Match object; span=(1, 5), match='yway'>
<re.Match object; span=(0, 6), match='cloud9'>
<re.Match object; span=(0, 1), match='d'>
apply a NOT to all in the character class
This uses the caret ^ as the first character inside the square brackets
#this finds a space
print(re.search(r"[^a-zA-Z0-9]","dy9 thon"))
#this finds an underscore
print(re.search(r"[^a-zA-Z0-9]","dy9_thon"))
#this includes a not for spaces
print(re.search(r"[^a-zA-Z0-9 ]","dy9 thon-"))
<re.Match object; span=(3, 4), match=' '>
<re.Match object; span=(3, 4), match='_'>
<re.Match object; span=(8, 9), match='-'>
find a string OR another one
print(re.search(r"cat|dog","I ilke cats"))
<re.Match object; span=(7, 10), match='cat'>
Greedy *
Extension of .
.*
means can have any length
#this finds something starting with p and ending with n
print(re.search(r"p.*n","python programming"))
#this finds something starting with py ending n but only a-z chars
print(re.search(r"py[a-z]*n","python programming"))
<re.Match object; span=(0, 17), match='python programmin'>
<re.Match object; span=(0, 6), match='python'>
match one or more occurrences +
So o+l
looks for one or more o followed by l, e.g. ol or ool
#this works
print(re.search(r"o+l","olly"))
#this fails because there is an i inbetween
print(re.search(r"o+l","oilly"))
#this finds from 1st o to l
print(re.search(r"o+l","oolly"))
# here we can just remove the +
print(re.search(r"ol","oolly"))
<re.Match object; span=(0, 2), match='ol'>
None
<re.Match object; span=(0, 3), match='ool'>
<re.Match object; span=(1, 3), match='ol'>
zero or one occurrence of the character before it ?
#
print(re.search(r"p?each","To each their own"))
#
print(re.search(r"p?each","To peach their own"))
#
print(re.search(r"p?each","Top each their own"))
<re.Match object; span=(3, 7), match='each'>
<re.Match object; span=(3, 8), match='peach'>
<re.Match object; span=(4, 8), match='each'>
special characters \
Putting \
before finds the special char
# . here is anything so works here
print(re.search(r".com","internet.com"))
# but not here
print(re.search(r".com","welcome"))
# Add backslash we get it here
print(re.search(r"\.com","internet.com"))
# and a negative here
print(re.search(r"\.com","welcome"))
# find a (
print(re.search(r"\(","welcome (no dont)"))
<re.Match object; span=(8, 12), match='.com'>
<re.Match object; span=(2, 6), match='lcom'>
<re.Match object; span=(8, 12), match='.com'>
None
<re.Match object; span=(8, 9), match='('>
More special chars
\w
matches letters number and underscores
\d
matches digits
\s
for whitespace characters, space tab or newline
^
Beginning character
$
End character
print(' so get here internet (stops at dot)')
print(re.search(r"\w*","internet.com"))
print("\n and here the whole string")
print(re.search(r"\w*","internet99_com"))
print("\n find country start and end in 'a'")
print("this works>>\n",re.search(r"A.*a","Australia"))
print("this doesn't end in a>>\n",re.search(r"A.*a","Azerbaijan"))
print("\n add the begin and end chars- works correct for both")
print(re.search(r"^A.*a$","Australia"))
print(re.search(r"^A.*a$","Azerbaijan"))
so get here internet (stops at dot)
<re.Match object; span=(0, 8), match='internet'>
and here the whole string
<re.Match object; span=(0, 14), match='internet99_com'>
find country start and end in 'a'
this works>>
<re.Match object; span=(0, 9), match='Australia'>
this doesn't end in a>>
<re.Match object; span=(0, 9), match='Azerbaija'>
add the begin and end chars- works correct for both
<re.Match object; span=(0, 9), match='Australia'>
None
Word boundaries \b
For word boundaries the \b
needs to be placed on both sides of the word to find
print(" Find the word hello")
print(re.search(r"\bhello\b", "hello darkness my old friend"))
print("\n Find the substring hell")
# The labels are ordinary (non-raw) strings, so the backslash is doubled:
# a bare "\b" there is a backspace character and would vanish from the output.
print("works without \\b>>\n", re.search(r"hell", "hello darkness my old friend"))
print("but not a full word so doesn't work with \\b>>\n", re.search(r"\bhell\b", "hello darkness my old friend"))
Find the word hello
<re.Match object; span=(0, 5), match='hello'>
Find the substring hell
works without >>
<re.Match object; span=(0, 4), match='hell'>
but not a full word so doesn't work with >>
None
Combine a few
This is for valid variable names
# ^[a-zA-Z_]     must start with a letter or an underscore
# [a-zA-Z0-9_]*  followed by any number of letters, digits or underscores
# $              and nothing else up to the end of the string
pattern = r"^[a-zA-Z_][a-zA-Z0-9_]*$"
print(re.search(pattern, "LLnananj_9"))
print(re.search(pattern, "LLnananj_9"))
# A leading digit is rejected, so this one prints None.
print(re.search(pattern, "9LLnananj_9"))
<re.Match object; span=(0, 10), match='LLnananj_9'>
<re.Match object; span=(0, 10), match='LLnananj_9'>
None
numeric repetition qualifiers {m,n}
[a-z]{n}
for a repetition of lower case chars n time
[a-z]{n,m}
repetition between n and m
[a-z]{n,}
repetition of n or more
[a-z]{,n}
repetitions of n or less
print(re.search(r"[a-zA-Z]{5}","a ghost"))
# a number but we only get the first
print(re.search(r"[a-zA-Z]{5}","a scary super ghost"))
<re.Match object; span=(2, 7), match='ghost'>
<re.Match object; span=(2, 7), match='scary'>
print(re.findall(r"[a-zA-Z]{5}","a scary super ghost"))
# but if we give a longer word?
print(re.findall(r"[a-zA-Z]{5}","a scary superior ghost"))
# we get part of the superior word
# to get just the words we want of 5 long can use \b
print(re.findall(r"\b[a-zA-Z]{5}\b","a scary superior ghost"))
['scary', 'super', 'ghost']
['scary', 'super', 'ghost']
['scary', 'ghost']
#between 2 and 4 exactly full word
print(re.search(r"\b[a-zA-Z]{2,4}\b","a ab abc abcd abcde abcdef"))
print(re.findall(r"\b[a-zA-Z]{2,4}\b","a ab abc abcd abcde abcdef"))
#NB \b needed otherwise see below
print(re.findall(r"[a-zA-Z]{2,4}","a ab abc abcd abcde abcdef"))
# 2 and above full word
print(re.findall(r"\b[a-zA-Z]{2,}\b","a ab abc abcd abcde abcdef"))
# {,3} up to this many reps
print(re.findall(r"\b[a-zA-Z]{,3}\b","a ab abc abcd abcde abcdef"))
<re.Match object; span=(2, 4), match='ab'>
['ab', 'abc', 'abcd']
['ab', 'abc', 'abcd', 'abcd', 'abcd', 'ef']
['ab', 'abc', 'abcd', 'abcde', 'abcdef']
['a', '', 'ab', '', 'abc', '', '', '', '', '', '', '']
Capturing groups
# start with letters number and underscores
# then comma and space
# ends with letters number and underscores
def dogroups(regExpr, string):
    """Search *string* with *regExpr* and print the match and its groups.

    Prints the inputs and the raw search result, then the tuple returned by
    ``groups()``, then ``result[0]`` to ``result[2]``.  An index the pattern
    has no capturing group for is reported as "no result N".  When nothing
    matches, "no match" is printed instead of the group details.

    Args:
        regExpr: the regular expression to search with.
        string: the text to search in.

    Returns:
        None; everything is written to standard output.
    """
    result = re.search(regExpr, string)
    print("String is >> {},\n regExpr is >> {},\n result is >> {}\n".format(string, regExpr, result))
    if result is None:
        # re.search returned None, which has no groups to inspect.
        print("no match")
        return
    print("groups", result.groups())
    for index in range(3):
        try:
            print("result[{}]".format(index), result[index])
        except IndexError:
            # Indexing a Match with a group number the pattern lacks raises IndexError.
            print("no result {}".format(index))
Match normally, just get one result
# No parentheses in the pattern, so only the whole match (index 0) exists.
dogroups(r"^\w*, \w*$", "Lovelace, Ada")
String is >> Lovelace, Ada,
regExpr is >> ^\w*, \w*$,
result is >> <re.Match object; span=(0, 13), match='Lovelace, Ada'>
groups ()
result[0] Lovelace, Ada
no result 1
no result 2
Use brackets ()
to match multiple results
# Each pair of parentheses captures its part of the match as a numbered group.
dogroups(r"(^\w*), (\w*$)", "Lovelace, Ada")
String is >> Lovelace, Ada,
regExpr is >> (^\w*), (\w*$),
result is >> <re.Match object; span=(0, 13), match='Lovelace, Ada'>
groups ('Lovelace', 'Ada')
result[0] Lovelace, Ada
result[1] Lovelace
result[2] Ada
log = "July 31 07:51:48 mycomputer bad_process[12345]: ERROR Performing package upgrade"
# A literal [ followed by one or more digits (captured) followed by a literal ]
regex = r"\[(\d+)\]"
result = re.search(regex, log)
# Index 0 is the whole match including the brackets, index 1 is the captured digits.
print("result[0]= {}, result[1]={}".format(result[0], result[1]))
print("result.group= {}, result.groups= {}".format(result.group(), result.groups()))
result[0]= [12345], result[1]=12345
result.group= [12345], result.groups= ('12345',)
def extract_pid(log_line):
    """Return the process id written in square brackets in *log_line*.

    The id is the first run of one or more digits enclosed in ``[`` and ``]``.

    Args:
        log_line: the log text to search.

    Returns:
        The digits as a string, or the string "None" (not the None object)
        when the line holds no bracketed number.
    """
    result = re.search(r"\[(\d+)\]", log_line)
    if result is None:
        return "None"
    return result[1]
print(extract_pid(log))
print(extract_pid("[cat] sass"))
12345
None
re.sub
the general format is:
re.sub(regular_expression_looking_for, what_to_replace_with,the_input_string)
# one or more of (word chars . % + -), followed by @, followed by one or more of (word chars . -)
print(re.sub(r"[\w.%+-]+@[\w.-]+","[REDACTED]","Received an email for go_nuts95@my.examle.com"))
Received an email for [REDACTED]
Combining with groups
If the regular expression has split the answer into groups (using ()
) then can specify those parts using \1
for first term \2
for second term etc in the what to replace with part
re.sub(regexp,r"\1 and \2",string)
the output is just the 1st and 2nd parts with “and” in the middle
texta = "Lovelace, Ada"
patt = r"^([\w]*), ([\w]*$)"
res = re.search(patt, texta)
print(res[0], res[1], res[2])
# Replacement "\2 \1" means: second group, a space, then the first group.
# Left as a bare expression so a notebook cell displays the returned string.
re.sub(patt, r"\2 \1", texta)
Lovelace, Ada Lovelace Ada
'Ada Lovelace'
Some Examples
import re
def repeating_letter_a(text):
    """Report whether *text* contains the letter "a" at least twice.

    The pattern is: an ``a`` or ``A``, then any run of lowercase letters or
    spaces, then another ``a`` or ``A``.

    Args:
        text: the string to inspect.

    Returns:
        True when such a span exists, otherwise False.
    """
    result = re.search(r"(a|A)[a-z ]*(a|A)", text)
    return result is not None
print(repeating_letter_a("banana")) # True
print(repeating_letter_a("pineapple")) # False
print(repeating_letter_a("Animal Kingdom")) # True
print(repeating_letter_a("A is for apple")) # True
# Fill in the code to check if the text
# passed has at least 2 groups of alphanumeric characters
# (including letters, numbers, and underscores)
# separated by one or more whitespace characters.
import re
def check_character_groups(text):
    """Report whether *text* has two word groups separated by whitespace.

    A group is one or more letters, digits or underscores; the separator is
    one or more whitespace characters, so wide gaps and tabs count too.

    Args:
        text: the string to inspect.

    Returns:
        True when two such groups are found next to each other, else False.
    """
    # \s+ (not a single \s) so that "One  Two" with a double space still matches.
    result = re.search(r"\w+\s+\w+", text)
    return result is not None
print(check_character_groups("One")) # False
print(check_character_groups("123 Ready Set GO")) # True
print(check_character_groups("username user_01")) # True
print(check_character_groups("shopping_list: milk, bread, eggs.")) # False
import re
def check_web_address(text):
    """Report whether *text* looks like a bare web address.

    The whole string must consist of letters, digits, underscores, dots and
    hyphens, and must end in a dot followed by at least one letter, digit or
    underscore.

    Args:
        text: the candidate address.

    Returns:
        True when the string has that shape, otherwise False.
    """
    # Anchored at both ends so characters such as "@" or "/" anywhere in the
    # text disqualify it; \w+ (not \w*) so a trailing dot alone is rejected.
    pattern = r"^[\w.-]+\.\w+$"
    result = re.search(pattern, text)
    return result is not None
print(check_web_address("gmail.com")) # True
print(check_web_address("www@google")) # False
print(check_web_address("www.Coursera.org")) # True
print(check_web_address("web-address.com/homepage")) # False
print(check_web_address("My_Favorite-Blog.US")) # True
True
False
True
False
True
import re
def rearrange_name(name):
    """Turn "Last, First" into "First Last".

    Each part may contain letters, digits, underscores, spaces, dots and
    hyphens, and the two parts must be separated by a comma and one space.

    Args:
        name: the name to rearrange.

    Returns:
        The rearranged name, or *name* unchanged when it does not have the
        "Last, First" shape.
    """
    result = re.search(r"^([\w \.-]*), ([\w \.-]*)$", name)
    if result is None:
        return name
    return "{} {}".format(result[2], result[1])
# Both inputs have the "Last, First" shape, so both come back reordered.
name = rearrange_name("Kennedy, John F.")
print(name)
name = rearrange_name("Kennedy, John Franklin")
print(name)
John F. Kennedy
John Franklin Kennedy
#words of at least 7 chars
import re
def long_words(text):
    """Return every word in *text* that is at least seven letters long.

    A word is an unbroken run of ASCII letters delimited by word boundaries.

    Args:
        text: the sentence to scan.

    Returns:
        A list of the matching words in order of appearance (possibly empty).
    """
    # \b on both sides keeps the match to whole words; {7,} means 7 or more.
    pattern = r"\b[A-Za-z]{7,}\b"
    return re.findall(pattern, text)
print(long_words("I like to drink coffee in the morning.")) # ['morning']
print(long_words("I also have a taste for hot chocolate in the afternoon.")) # ['chocolate', 'afternoon']
print(long_words("I never drink tea late at night.")) # []
['morning']
['chocolate', 'afternoon']
[]
# Add to the regular expression used in the extract_pid function,
# to return the uppercase message in parenthesis, after the process id.
import re
def extract_pid(log_line):
    """Return the process id and the uppercase message that follows it.

    Looks for ``[digits]: MESSAGE`` where MESSAGE is a whole word made only
    of uppercase letters.

    Args:
        log_line: the log text to search.

    Returns:
        A string of the form "12345 (ERROR)", or None when the line has no
        bracketed number followed by ": " and an uppercase word.
    """
    # \d+ requires at least one digit; the trailing \b stops a capitalised
    # word such as "Error" from matching on its first letter only.
    regex = r"\[(\d+)\]: ([A-Z]+)\b"
    result = re.search(regex, log_line)
    if result is None:
        return None
    return "{} ({})".format(result[1], result[2])
print(extract_pid("July 31 07:51:48 mycomputer bad_process[12345]: ERROR Performing package upgrade")) # 12345 (ERROR)
print(extract_pid("99 elephants in a [cage]")) # None
print(extract_pid("A string that also has numbers [34567] but no uppercase message")) # None
print(extract_pid("July 31 08:08:08 mycomputer new_process[67890]: RUNNING Performing backup")) # 67890 (RUNNING)
<re.Match object; span=(39, 46), match='[12345]'>
12345
None
<re.Match object; span=(31, 38), match='[34567]'>
34567
<re.Match object; span=(39, 46), match='[67890]'>
67890
# We want to split a piece of text by either the word "a" or "the",
# as implemented in the following code.
# What is the resulting split list?
# The alternation is not limited to whole words, so every "the" and every
# letter "a" acts as a separator.  Left as a bare expression so a notebook
# cell displays the returned list.
re.split(r"the|a", "One sentence. Another one? And the last one!")
['One sentence. Ano', 'r one? And ', ' l', 'st one!']
import re
def transform_record(record):
    """Reorder a "Name,Phone,Job" record into "Name,Job (+1-Phone)".

    Name and job are runs of two or more letters and spaces; the phone is a
    run of two or more digits and hyphens.

    Args:
        record: the comma-separated record.

    Returns:
        The reordered record; text that does not match the pattern is
        returned unchanged.
    """
    pat = r"(\b[A-Za-z ]{2,}\b),([0-9-]{2,}),(\b[A-Za-z ]{2,}\b)"
    # \1 = name, \2 = phone, \3 = job
    return re.sub(pat, r"\1,\3 (+1-\2)", record)
# Change the order to Name, Job, (Phone No.)
print(transform_record("Sabrina Green,802-867-5309,System Administrator"))
print(transform_record("Eli Jones,684-3481127,IT specialist"))
print(transform_record("Melody Daniels,846-687-7436,Programmer"))
print(transform_record("Charlie Rivera,698-746-3357,Web Developer"))
Sabrina Green,System Administrator (+1-802-867-5309)
Eli Jones,IT specialist (+1-684-3481127)
Melody Daniels,Programmer (+1-846-687-7436)
Charlie Rivera,Web Developer (+1-698-746-3357)
import re
def multi_vowel_words(text):
    """Return the words in *text* that contain three or more vowels in a row.

    Only lowercase vowels (a, e, i, o, u) count towards the run.

    Args:
        text: the sentence to scan.

    Returns:
        A list of the matching words in order of appearance (possibly empty).
    """
    # letters before the run, the run of 3+ vowels, lowercase letters after it
    pattern = r"[A-Za-z]*[aeiou]{3,}[a-z]*"
    return re.findall(pattern, text)
print(multi_vowel_words("Life is beautiful"))
# ['beautiful']
print(multi_vowel_words("Obviously, the queen is courageous and gracious."))
# ['Obviously', 'queen', 'courageous', 'gracious']
print(multi_vowel_words("The rambunctious children had to sit quietly and await their delicious dinner."))
# ['rambunctious', 'quietly', 'delicious']
print(multi_vowel_words("The order of a data queue is First In First Out (FIFO)"))
# ['queue']
print(multi_vowel_words("Hello world!"))
# []
['beautiful']
['Obviously', 'queen', 'courageous', 'gracious']
['rambunctious', 'quietly', 'delicious']
['queue']
[]
import re
def transform_comments(line_of_code):
    """Replace each run of one or more ``#`` characters with ``//``.

    Args:
        line_of_code: one line of source text.

    Returns:
        The line with every hash run replaced; a line without ``#`` is
        returned unchanged.
    """
    return re.sub(r"#+", "//", line_of_code)
print(transform_comments("### Start of program"))
# Should be "// Start of program"
print(transform_comments(" number = 0 ## Initialize the variable"))
# Should be " number = 0 // Initialize the variable"
print(transform_comments(" number += 1 # Increment the variable"))
# Should be " number += 1 // Increment the variable"
print(transform_comments(" return(number)"))
# Should be " return(number)"
// Start of program
number = 0 // Initialize the variable
number += 1 // Increment the variable
return(number)
string = "My number is 21-345-9999."

patt = r"([0-9]{2}-)"
# Scratch experiment: collect every "two digits then a hyphen" chunk of the
# sample text defined above.
resa = re.findall(patt, string)
# result = re.sub(patt,r"({0}),{1},{2}".format(resa[0],resa[1],resa[2]),phone)
# result
import re
def convert_phone_number(phone):
    """Rewrite phone numbers from XXX-XXX-XXXX to (XXX) XXX-XXXX.

    Only a complete 3-3-4 digit number delimited by word boundaries is
    rewritten, wherever it appears in the text; anything else (too many
    digits, space-separated international numbers) is left untouched.

    Args:
        phone: text that may contain a phone number.

    Returns:
        The text with every matching number reformatted.
    """
    # \b on both sides keeps a longer digit run such as 123-123-12345 from
    # being partially matched, and lets a number at the start of the text match.
    patt = r"\b([0-9]{3})-([0-9]{3})-([0-9]{4})\b"
    return re.sub(patt, r"(\1) \2-\3", phone)
print(convert_phone_number("My number is 212-345-9999.")) # My number is (212) 345-9999.
print(convert_phone_number("Please call 888-555-1234")) # Please call (888) 555-1234
print(convert_phone_number("123-123-12345")) # 123-123-12345
print(convert_phone_number("Phone number of Buckingham Palace is +44 303 123 7300")) # Phone number of Buckingham Palace is +44 303 123 7300
My number is (212) 345-9999.
Please call (888) 555-1234
123-123-12345
Phone number of Buckingham Palace is +44 303 123 7300