Regular expressions RE

Regular expression in Python

Author

Thomas H. Simm

import re

Regular Expressions Cheat-Sheets

Check out the following links for more information:

https://docs.python.org/3/howto/regex.html

https://docs.python.org/3/library/re.html

https://docs.python.org/3/howto/regex.html#greedy-versus-non-greedy

Shout out to regex101.com, which will explain each stage of a regex.

Summary

^ Beginning character

$ End character

Putting \ before a special character matches that character literally

\w matches letters, numbers and underscores

\d matches digits

\s for whitespace characters, space tab or newline

\b for word boundaries

[a-z] is all lowercase letters

[A-Z] is uppercase letters

[0-9] is numbers

. is a wildcard (joker): it matches any single character except a newline

* means the item before it can repeat any number of times (zero or more)

? zero or one occurrence of the character before it

^ as the first character inside [] applies a NOT to everything in the character class

| Or statement

[] within square brackets are or statements

{n,m} numeric repetition qualifier in curly braces: between n and m repetitions. Leaving out n gives "at most m"; leaving out m gives "at least n"

Different commands

re.search finds first instance

re.findall finds all instances

re.split split based on the expression

re.sub substitute a part of the string

# re.search stops at the first match and returns a Match object.
first_match = re.search(r"ba", "babar")
print('Using>> re.search(r"ba","babar") gives:\n', first_match)

# re.findall collects every non-overlapping match into a list of strings.
every_match = re.findall(r"ba", "babar")
print('Using>> re.findall(r"ba","babar") gives:\n', every_match)

Using>> re.search(r"ba","babar") gives:
 <re.Match object; span=(0, 2), match='ba'>
Using>> re.findall(r"ba","babar") gives:
 ['ba', 'ba']

# Split wherever a '.', '?' or '!' occurs; the delimiter characters are dropped.
# The text ends with a delimiter, so the last list element is an empty string.
print(re.split(r"[.?!]","the dog! is here. whhere? oh I see."))

['the dog', ' is here', ' whhere', ' oh I see', '']

# Replace "dog" with "cat" in the input string.
print(re.sub(r"dog","cat","the dog! is here. whhere? oh I see."))

the cat! is here. whhere? oh I see.

Search within a string

# Part 1: a plain pattern is found wherever it occurs inside the string.
print('        Find a string within a string')

result = re.search(r"aza","bazaar")
print(result)
result = re.search(r"aza","plaza")
print(result)

# Part 2: ^ anchors the pattern to the start of the string.
print('\n        At start of the string')
result=re.search(r"^z","zebra")
print(result)
# "plaza" contains a z but does not start with one, so this prints None.
result = re.search(r"^z","plaza")
print(result)

# Part 3: each . stands for exactly one arbitrary character.
print('\n        The joker .')
result = re.search(r"x.n","xenon")
print(result)
result = re.search(r"x..o","xenon")
print(result)

        Find a string within a string
<re.Match object; span=(1, 4), match='aza'>
<re.Match object; span=(2, 5), match='aza'>

        At start of the string
<re.Match object; span=(0, 1), match='z'>
None

        The joker .
<re.Match object; span=(0, 3), match='xen'>
<re.Match object; span=(0, 4), match='xeno'>

Character classes

These are inside square brackets and are OR statements

[a-z] is all lowercase letters, [A-Z] is uppercase letters, [0-9] is digits

# [Ppc] accepts exactly one of P, p or c before "ython".
print(re.search(r"[Ppc]ython","cython"))
# [a-z] accepts any single lowercase letter.
print(re.search(r"[a-z]ython","dython"))
# The character before "way" is a space, which is not in [a-z]: no match.
print(re.search(r"[a-z]way","My way"))
# Here "y" comes directly before "way", so the match is "yway".
print(re.search(r"[a-z]way","Myway"))

# Find "cloud" with a letter or digit directly after it.
print(re.search(r"cloud[a-zA-Z0-9]","cloud9"))
# On its own the class matches the first letter or digit in the string.
print(re.search(r"[a-zA-Z0-9]","dy9thon"))

<re.Match object; span=(0, 6), match='cython'>
<re.Match object; span=(0, 6), match='dython'>
None
<re.Match object; span=(1, 5), match='yway'>
<re.Match object; span=(0, 6), match='cloud9'>
<re.Match object; span=(0, 1), match='d'>

apply a NOT to all in the character class

This uses the caret ^ as the first character inside the square brackets

# The first character that is neither a letter nor a digit is the space.
print(re.search(r"[^a-zA-Z0-9]","dy9 thon"))
# An underscore is not in the class either, so it is what gets found here.
print(re.search(r"[^a-zA-Z0-9]","dy9_thon"))

# Adding a space to the negated class skips spaces too, so the "-" is found.
print(re.search(r"[^a-zA-Z0-9 ]","dy9 thon-"))

<re.Match object; span=(3, 4), match=' '>
<re.Match object; span=(3, 4), match='_'>
<re.Match object; span=(8, 9), match='-'>

find a string OR another one


# | matches either alternative; "cat" is found inside "cats".
print(re.search(r"cat|dog","I ilke cats"))

<re.Match object; span=(7, 10), match='cat'>

Greedy `*`

Extension of .

.* means any character, repeated any number of times (including zero)

# Starts with p and ends with n. The * is greedy, so the match runs to the
# LAST n in the string, not the first one.
print(re.search(r"p.*n","python programming"))

# Starts with py and ends with n, with only a-z in between, so the match
# cannot cross the space.
print(re.search(r"py[a-z]*n","python programming"))

<re.Match object; span=(0, 17), match='python programmin'>
<re.Match object; span=(0, 6), match='python'>

match one or more occurrences `+`

So o+l looks for one or more o followed by an l

# A single o followed by l: matches "ol".
print(re.search(r"o+l","olly"))

# No match: there is an i between the o and the l.
print(re.search(r"o+l","oilly"))

# + takes both o's, so the match runs from the first o to the l.
print(re.search(r"o+l","oolly"))

# Without the +, only the single o directly before the l is matched.
print(re.search(r"ol","oolly"))

<re.Match object; span=(0, 2), match='ol'>
None
<re.Match object; span=(0, 3), match='ool'>
<re.Match object; span=(1, 3), match='ol'>

zero or one occurrence of the character before it `?`

# No p before "each": the optional p matches zero times.
print(re.search(r"p?each","To each their own"))

# A p directly before "each" is included in the match.
print(re.search(r"p?each","To peach their own"))

# The p in "Top" is separated from "each" by a space, so it is not included.
print(re.search(r"p?each","Top each their own"))

<re.Match object; span=(3, 7), match='each'>
<re.Match object; span=(3, 8), match='peach'>
<re.Match object; span=(4, 8), match='each'>

special characters `\`

Putting \ before a special character matches that character literally

# An unescaped . matches any character, so it works here...
print(re.search(r".com","internet.com"))
# ...but it also matches "lcom" inside "welcome", which is not what we want.
print(re.search(r".com","welcome"))
# With a backslash, \. matches only a literal dot.
print(re.search(r"\.com","internet.com"))
# So "welcome" no longer matches.
print(re.search(r"\.com","welcome"))
# ( is special too; \( finds a literal opening parenthesis.
print(re.search(r"\(","welcome (no dont)"))

<re.Match object; span=(8, 12), match='.com'>
<re.Match object; span=(2, 6), match='lcom'>
<re.Match object; span=(8, 12), match='.com'>
None
<re.Match object; span=(8, 9), match='('>

More special chars

\w matches letters, numbers and underscores

\d matches digits

\s for whitespace characters, space tab or newline

^ Beginning character

$ End character

# \w* stops at the first non-word character (the dot).
print('       so get here internet (stops at dot)')
print(re.search(r"\w*","internet.com"))

# Digits and underscores are word characters, so the whole string matches.
print("\n     and here the whole string")
print(re.search(r"\w*","internet99_com"))


print("\n     find country start and end in 'a'")

print("this works>>\n",re.search(r"A.*a","Australia"))
# Without anchors the pattern still matches "Azerbaija", a prefix ending in a.
print("this doesn't end in a>>\n",re.search(r"A.*a","Azerbaijan"))

# ^ and $ force the whole string to start with A and end with a.
print("\n     add the begin and end chars- works correct for both")
print(re.search(r"^A.*a$","Australia"))
print(re.search(r"^A.*a$","Azerbaijan"))

       so get here internet (stops at dot)
<re.Match object; span=(0, 8), match='internet'>

     and here the whole string
<re.Match object; span=(0, 14), match='internet99_com'>

     find country start and end in 'a'
this works>>
 <re.Match object; span=(0, 9), match='Australia'>
this doesn't end in a>>
 <re.Match object; span=(0, 9), match='Azerbaija'>

     add the begin and end chars- works correct for both
<re.Match object; span=(0, 9), match='Australia'>
None

Word boundaries `\b`

For word boundaries the \b needs to be placed on both sides of the word to find

# The printed labels are ordinary (non-raw) strings, so the backslash in "\b"
# is doubled there; a single "\b" would be the backspace character and the
# label would lose its "\b" text. The regex patterns themselves are raw strings.
print("           Find the word hello")
# \b on both sides: "hello" must be a whole word.
print(re.search(r"\bhello\b","hello darkness my old friend"))
print("\n           Find the substring hell")
# Without \b, "hell" is found inside "hello".
print("works without \\b>>\n",re.search(r"hell","hello darkness my old friend"))
# With \b on both sides there is no standalone word "hell", so this is None.
print("but not a full word so doesn't work with \\b>>\n",re.search(r"\bhell\b","hello darkness my old friend"))

           Find the word hello
<re.Match object; span=(0, 5), match='hello'>

           Find the substring hell
works without >>
 <re.Match object; span=(0, 4), match='hell'>
but not a full word so doesn't work with >>
 None

Combine a few

This is for valid variable names

# ^[a-zA-Z_]     first character: a letter or an underscore
# [a-zA-Z0-9_]*  then any number of letters, digits or underscores
# $              and nothing else up to the end of the string
pattern=r"^[a-zA-Z_][a-zA-Z0-9_]*$"

print(re.search(pattern,"LLnananj_9"))

print(re.search(pattern,"LLnananj_9"))
# A leading digit is rejected, so this prints None.
print(re.search(pattern,"9LLnananj_9"))

<re.Match object; span=(0, 10), match='LLnananj_9'>
<re.Match object; span=(0, 10), match='LLnananj_9'>
None

numeric repetition qualifiers `{m,n}`

[a-z]{n} for a repetition of lower case chars exactly n times

[a-z]{n,m} repetition between n and m

[a-z]{n,} repetition of n or more

[a-z]{,n} repetitions of n or less

# {5} asks for exactly five letters in a row.
print(re.search(r"[a-zA-Z]{5}","a ghost"))

# There are several five-letter runs, but re.search only returns the first.
print(re.search(r"[a-zA-Z]{5}","a scary super ghost"))

<re.Match object; span=(2, 7), match='ghost'>
<re.Match object; span=(2, 7), match='scary'>

# re.findall returns every five-letter run.
print(re.findall(r"[a-zA-Z]{5}","a scary super ghost"))

# But what if the text holds a longer word?
print(re.findall(r"[a-zA-Z]{5}","a scary superior ghost"))

# We get the first five letters of "superior" ("super").

# To keep only whole words of exactly five letters, wrap the pattern in \b.
print(re.findall(r"\b[a-zA-Z]{5}\b","a scary superior ghost"))

['scary', 'super', 'ghost']
['scary', 'super', 'ghost']
['scary', 'ghost']

# Whole words of between 2 and 4 letters; re.search returns only the first.
print(re.search(r"\b[a-zA-Z]{2,4}\b","a ab abc abcd abcde abcdef"))

print(re.findall(r"\b[a-zA-Z]{2,4}\b","a ab abc abcd abcde abcdef"))

# NB: \b is needed, otherwise pieces of the longer words are returned as well.
print(re.findall(r"[a-zA-Z]{2,4}","a ab abc abcd abcde abcdef"))

# {2,}: whole words of 2 or more letters.
print(re.findall(r"\b[a-zA-Z]{2,}\b","a ab abc abcd abcde abcdef"))

# {,3}: up to 3 repetitions. Zero repetitions are allowed too, which is why
# empty strings show up in the result.
print(re.findall(r"\b[a-zA-Z]{,3}\b","a ab abc abcd abcde abcdef"))

<re.Match object; span=(2, 4), match='ab'>
['ab', 'abc', 'abcd']
['ab', 'abc', 'abcd', 'abcd', 'abcd', 'ef']
['ab', 'abc', 'abcd', 'abcde', 'abcdef']
['a', '', 'ab', '', 'abc', '', '', '', '', '', '', '']

Capturing groups

# start with letters number and underscores  
# then comma and space
# ends with letters number and underscores

def dogroups(regExpr,string):
    """Search *string* for *regExpr* and print the match and its groups.

    Prints the input string, the pattern and the raw ``re.search`` result,
    then the tuple of captured groups, followed by ``result[0]`` (the whole
    match), ``result[1]`` and ``result[2]``. A group index that the pattern
    does not define is reported as ``no result N``.

    When the pattern does not match at all, ``no match`` is printed instead
    of failing on ``None.groups()``.

    Args:
        regExpr: Regular expression pattern handed to ``re.search``.
        string: Text to search.

    Returns:
        None. All output goes to stdout.
    """
    result = re.search(regExpr, string)

    print("String is >> {},\n regExpr is >> {},\n result is >> {}\n".format(string, regExpr, result))

    if result is None:
        # re.search returns None on failure; there are no groups to show.
        print("no match")
        return

    print("groups", result.groups())
    for index in range(3):
        try:
            print("result[{}]".format(index), result[index])
        except IndexError:
            # Indexing a Match with a group number the pattern lacks raises IndexError.
            print("no result {}".format(index))

Match normally, just get one result

# No parentheses in the pattern, so only result[0] (the whole match) exists.
dogroups(r"^\w*, \w*$","Lovelace, Ada")

String is >> Lovelace, Ada,
 regExpr is >> ^\w*, \w*$,
 result is >> <re.Match object; span=(0, 13), match='Lovelace, Ada'>

groups ()
result[0] Lovelace, Ada
no result 1
no result 2

Use brackets `()` to match multiple results

# Each (...) is a capture group: group 1 is the text before ", " and
# group 2 the text after it.
dogroups(r"(^\w*), (\w*$)","Lovelace, Ada")

String is >> Lovelace, Ada,
 regExpr is >> (^\w*), (\w*$),
 result is >> <re.Match object; span=(0, 13), match='Lovelace, Ada'>

groups ('Lovelace', 'Ada')
result[0] Lovelace, Ada
result[1] Lovelace
result[2] Ada

log = "July 31 07:51:48 mycomputer bad_process[12345]: ERROR Performing package upgrade"
# \[ and \] match literal brackets; (\d+) captures one or more digits between them.
regex= r"\[(\d+)\]"
result= re.search(regex,log)
# result[0] is the whole match (brackets included); result[1] is the captured digits.
print("result[0]= {}, result[1]={}".format(result[0],result[1]))
# group() with no argument is the whole match; groups() is a tuple of all captures.
print("result.group= {}, result.groups= {}".format(result.group(),result.groups()))

result[0]= [12345], result[1]=12345
result.group= [12345], result.groups= ('12345',)

def extract_pid(log_line):
    """Return the process id found in square brackets in *log_line*.

    Args:
        log_line: A log line such as ``"... bad_process[12345]: ERROR ..."``.

    Returns:
        The digits of the first ``[digits]`` occurrence as a string, or
        ``None`` when the line contains no such occurrence.
    """
    result = re.search(r"\[(\d+)\]", log_line)
    if result is None:
        # Return the real None object, not the string "None": the string is
        # truthy and callers cannot tell it apart from a real value.
        return None
    return result[1]

# The sample log line contains "[12345]", so the pid is printed.
print(extract_pid(log))
# "[cat]" holds no digits, so the pattern does not match.
print(extract_pid("[cat]  sass"))

12345
None

`re.sub`

the general format is:

re.sub(regular_expression_looking_for, what_to_replace_with,the_input_string)

# [\w.%+-]+ : one or more letters, digits, underscores, '.', '%', '+' or '-',
# followed by @, then [\w.-]+ : one or more letters, digits, underscores, '.' or '-'

print(re.sub(r"[\w.%+-]+@[\w.-]+","[REDACTED]","Received an email for go_nuts95@my.examle.com"))

Received an email for [REDACTED]

Combining with groups

If the regular expression splits the match into groups (using ()), those parts can be referred to in the replacement string as \1 for the first group, \2 for the second group, and so on.

re.sub(regexp,r"\1 and \2",string)

the output is just the 1st and 2nd parts with “and” in the middle

texta = "Lovelace, Ada"
# Group 1 is the text before ", " and group 2 is the text after it.
patt = r"^([\w]*), ([\w]*$)"
res=re.search(patt, texta)
print(res[0],res[1],res[2])

# Replace the whole match with: group 2, a space, then group 1.
re.sub(patt, r"\2 \1",texta)

Lovelace, Ada Lovelace Ada

'Ada Lovelace'

Some Examples

import re
def repeating_letter_a(text):
    """Report whether *text* holds two a's linked by lowercase letters or spaces.

    The pattern needs an ``a`` or ``A``, then any run of lowercase letters
    and spaces, then another ``a`` or ``A``.

    Returns:
        True when such a stretch exists in *text*, otherwise False.
    """
    return re.search(r"(a|A)[a-z ]*(a|A)", text) is not None

print(repeating_letter_a("banana")) # True
print(repeating_letter_a("pineapple")) # False
print(repeating_letter_a("Animal Kingdom")) # True
print(repeating_letter_a("A is for apple")) # True

# Fill in the code to check if the text
# passed has at least 2 groups of alphanumeric characters 
# (including letters, numbers, and underscores)
# separated by one or more whitespace characters.
import re
def check_character_groups(text):
    """Check for two groups of word characters separated by whitespace.

    Word characters are letters, digits and underscores.

    Args:
        text: The string to inspect.

    Returns:
        True if *text* contains at least two word-character groups with one
        or more whitespace characters between them, otherwise False.
    """
    # \s+ rather than a single \s, so that a separator made of several
    # whitespace characters (e.g. two spaces) is accepted as well.
    return re.search(r"\w+\s+\w+", text) is not None

print(check_character_groups("One")) # False
print(check_character_groups("123  Ready Set GO")) # True
print(check_character_groups("username user_01")) # True
print(check_character_groups("shopping_list: milk, bread, eggs.")) # False

import re
def check_web_address(text):
    """Check whether *text* ends like a simple web address.

    The text must end with a word character (letter, digit or underscore),
    a literal dot, and then one or more word characters up to the end of
    the string.

    Args:
        text: The candidate address.

    Returns:
        True if *text* ends with ``<word char>.<word chars>``, else False.
    """
    # \w+ (not \w*) after the dot: an empty suffix such as "gmail." must not pass.
    pattern = r"\w\.\w+$"
    return re.search(pattern, text) is not None

print(check_web_address("gmail.com")) # True
print(check_web_address("www@google")) # False
print(check_web_address("www.Coursera.org")) # True
print(check_web_address("web-address.com/homepage")) # False
print(check_web_address("My_Favorite-Blog.US")) # True

True
False
True
False
True

import re
def rearrange_name(name):
    """Turn ``"Last, First"`` into ``"First Last"``.

    Both parts may contain word characters, spaces, dots and dashes. A name
    that does not fit the ``"<part>, <part>"`` shape is returned unchanged.
    """
    match = re.search(r"^([\w \.-]*), ([\w \.-]*)$", name)
    if match is not None:
        last_part, first_part = match[1], match[2]
        return "{} {}".format(first_part, last_part)
    return name

name=rearrange_name("Kennedy, John F.")
print(name)

name=rearrange_name("Kennedy, John Franklin")
print(name)

John F. Kennedy
John Franklin Kennedy

#words of at least 7 chars
import re
def long_words(text):
    """Return every whole word in *text* made of at least seven letters."""
    # \b ... \b pins the match to whole words; {7,} demands seven or more letters.
    return re.findall(r"\b[A-Za-z]{7,}\b", text)

print(long_words("I like to drink coffee in the morning.")) # ['morning']
print(long_words("I also have a taste for hot chocolate in the afternoon.")) # ['chocolate', 'afternoon']
print(long_words("I never drink tea late at night.")) # []

['morning']
['chocolate', 'afternoon']
[]

# Add to the regular expression used in the extract_pid function, 
# to return the uppercase message in parenthesis, after the process id.

import re
def extract_pid(log_line):
    """Return the process id and uppercase message from a log line.

    The line must contain ``[<digits>]: <UPPERCASE WORD>``, for example
    ``bad_process[12345]: ERROR Performing package upgrade``.

    Args:
        log_line: The log line to inspect.

    Returns:
        ``"<pid> (<MESSAGE>)"`` such as ``"12345 (ERROR)"``, or ``None``
        when the line has no bracketed pid followed by an uppercase message.
    """
    # Group 1: the digits inside the brackets. Group 2: the all-uppercase
    # word after "]: "; the trailing \b rejects words like "Error".
    regex = r"\[(\d+)\]: ([A-Z]+)\b"
    result = re.search(regex, log_line)
    if result is None:
        return None
    return "{} ({})".format(result[1], result[2])

print(extract_pid("July 31 07:51:48 mycomputer bad_process[12345]: ERROR Performing package upgrade")) # 12345 (ERROR)
print(extract_pid("99 elephants in a [cage]")) # None
print(extract_pid("A string that also has numbers [34567] but no uppercase message")) # None
print(extract_pid("July 31 08:08:08 mycomputer new_process[67890]: RUNNING Performing backup")) # 67890 (RUNNING)

<re.Match object; span=(39, 46), match='[12345]'>
12345
None
<re.Match object; span=(31, 38), match='[34567]'>
34567
<re.Match object; span=(39, 46), match='[67890]'>
67890

# We want to split a piece of text by either the word "a" or "the",
# as implemented in the following code.
# What is the resulting split list?
# NOTE: the pattern has no word boundaries, so it splits on the letters "the"
# and the letter "a" wherever they occur, including inside other words.

re.split(r"the|a", "One sentence. Another one? And the last one!")

['One sentence. Ano', 'r one? And ', ' l', 'st one!']

import re
def transform_record(record):
    """Reorder a ``name,phone,job`` record into ``name,job (+1-phone)``.

    A record that does not fit the pattern is returned unchanged.
    """
    # Group 1 = name, group 2 = phone number, group 3 = job title.
    fields = r"(\b[A-Za-z ]{2,}\b),([0-9-]{2,}),(\b[A-Za-z ]{2,}\b)"
    return re.sub(fields, r"\1,\3 (+1-\2)", record)

# Change the order to Name, Job, (Phone No.)

print(transform_record("Sabrina Green,802-867-5309,System Administrator"))

print(transform_record("Eli Jones,684-3481127,IT specialist"))

print(transform_record("Melody Daniels,846-687-7436,Programmer"))

print(transform_record("Charlie Rivera,698-746-3357,Web Developer"))

Sabrina Green,System Administrator (+1-802-867-5309)
Eli Jones,IT specialist (+1-684-3481127)
Melody Daniels,Programmer (+1-846-687-7436)
Charlie Rivera,Web Developer (+1-698-746-3357)

import re
def multi_vowel_words(text):
    """List the words in *text* that contain three or more vowels in a row.

    Only the lowercase letters a, e, i, o and u count as vowels.
    """
    # Leading letters, a run of 3+ lowercase vowels, then trailing lowercase letters.
    vowel_run_word = r"[A-Za-z]*[aeiou]{3,}[a-z]*"
    return re.findall(vowel_run_word, text)

print(multi_vowel_words("Life is beautiful"))
# ['beautiful']

print(multi_vowel_words("Obviously, the queen is courageous and gracious."))
# ['Obviously', 'queen', 'courageous', 'gracious']

print(multi_vowel_words("The rambunctious children had to sit quietly and await their delicious dinner."))
# ['rambunctious', 'quietly', 'delicious']

print(multi_vowel_words("The order of a data queue is First In First Out (FIFO)"))
# ['queue']

print(multi_vowel_words("Hello world!"))
# []

['beautiful']
['Obviously', 'queen', 'courageous', 'gracious']
['rambunctious', 'quietly', 'delicious']
['queue']
[]

import re
def transform_comments(line_of_code):
    """Swap each run of one or more ``#`` characters in the line for ``//``."""
    return re.sub(r"#{1,}", "//", line_of_code)

print(transform_comments("### Start of program"))
# Should be "// Start of program"
print(transform_comments("  number = 0   ## Initialize the variable"))
# Should be "  number = 0   // Initialize the variable"
print(transform_comments("  number += 1   # Increment the variable"))
# Should be "  number += 1   // Increment the variable"
print(transform_comments("  return(number)"))
# Should be "  return(number)"

// Start of program
  number = 0   // Initialize the variable
  number += 1   // Increment the variable
  return(number)

string = "My number is 21-345-9999."
# The search below reads `phone`, which was never assigned (NameError on a
# fresh run); bind it to the sample text.
phone = string

# Capture each run of exactly two digits that is directly followed by a dash.
patt = r"([0-9]{2}-)"
resa=re.findall(patt,phone)
# result = re.sub(patt,r"({0}),{1},{2}".format(resa[0],resa[1],resa[2]),phone)


# result

import re
def convert_phone_number(phone):
    """Rewrite US-style numbers ``XXX-XXX-XXXX`` as ``(XXX) XXX-XXXX``.

    Only numbers with exactly 3, 3 and 4 digits separated by dashes are
    converted; everything else in the text is left as it is.

    Args:
        phone: Text that may contain phone numbers.

    Returns:
        The text with every matching phone number reformatted.
    """
    # \b on both ends instead of a required leading whitespace: a number at
    # the very start of the text is converted too, and a number with extra
    # digits (e.g. 123-123-12345) is rejected wherever it appears.
    patt = r"\b([0-9]{3})-([0-9]{3})-([0-9]{4})\b"
    return re.sub(patt, r"(\1) \2-\3", phone)

print(convert_phone_number("My number is 212-345-9999.")) # My number is (212) 345-9999.
print(convert_phone_number("Please call 888-555-1234")) # Please call (888) 555-1234
print(convert_phone_number("123-123-12345")) # 123-123-12345
print(convert_phone_number("Phone number of Buckingham Palace is +44 303 123 7300")) # Phone number of Buckingham Palace is +44 303 123 7300

My number is (212) 345-9999.
Please call (888) 555-1234
123-123-12345
Phone number of Buckingham Palace is +44 303 123 7300