Canonicalization

KB0129 2023. 5. 24. 09:37

import re

# Sample HTML fragment whose tags we want to strip.
text = "<div><td valign='top'>Moo</td></div>"
# One tag: "<", then one or more characters that are not ">", then ">".
pattern = r"<[^>]+>"
# Replace every tag with the empty string, leaving only the enclosed text.
re.sub(pattern, "", text)

# returns 'Moo'

Notice the r preceding the regular expression pattern; this specifies that the regular expression is a raw string.

Raw strings do not recognize escape sequences. This makes them useful for regular expressions, which often contain literal '\' characters.

import pandas as pd  # pd.DataFrame is used below; only `re` was imported so far

# One raw HTML snippet per row, all stored in a single "HTML" column.
# Line breaks inside brackets need no backslash continuation.
data = {
    "HTML": [
        "<div><td valign='top'>Moo</td></div>",
        "<a href='http://ds100.org'>Link</a>",
        "<b>Bold text</b>",
    ]
}
html_data = pd.DataFrame(data)

html_data

https://ds100.org/course-notes/regex/regex.html#canonicalization-with-pandas-series-methods

# One HTML tag: "<", then one or more characters that are not ">", then ">".
pattern = r"<[^>]+>"
# regex=True tells pandas to treat `pattern` as a regular expression rather
# than a literal substring; every match in every row is replaced with "".
html_data["HTML"].str.replace(pattern, "", regex=True)

'''
0          Moo
1         Link
2    Bold text
Name: HTML, dtype: object
'''

Extraction with Pandas

# Sample column mixing rows with zero, one, and two SSN-like substrings.
data = {
    "SSN": [
        "987-65-4321",
        "forty",
        "123-45-6789 bro or 321-45-6789",
        "999-99-9999",
    ]
}
ssn_data = pd.DataFrame(data)

ssn_data

# SSN shape: three digits, two digits, four digits, separated by hyphens.
# `pattern` must be redefined here; otherwise it would still hold the HTML-tag
# pattern from the canonicalization example and every row would yield [].
pattern = r"[0-9]{3}-[0-9]{2}-[0-9]{4}"
# findall returns, for each row, the list of all non-overlapping matches.
ssn_data["SSN"].str.findall(pattern)

'''
0                 [987-65-4321]
1                            []
2    [123-45-6789, 321-45-6789]
3                 [999-99-9999]
Name: SSN, dtype: object
'''

Examples:

# Adjacent string literals are concatenated; the run of spaces between the
# two sentences is part of the original text and is kept as-is.
text = (
    "Observations: 03:04:53 - Horse awakens. "
    "        03:05:14 - Horse goes back to sleep."
)

# We want to capture all occurrences of time data, with hour, minute, and
# second as separate entities: one capture group for each.
pattern_1 = r"(\d\d):(\d\d):(\d\d)"
re.findall(pattern_1, text)

# [('03', '04', '53'), ('03', '05', '14')]

# Shorthand to extract the same data: \d{2} matches exactly two digits,
# just like \d\d.
pattern_2 = r"(\d\d):(\d\d):(\d{2})"
re.findall(pattern_2, text)

# [('03', '04', '53'), ('03', '05', '14')]

# Work with the first line of the log as a sample.
first = log_lines[0]
first

# '169.237.46.168 - - [26/Jan/2014:10:47:58 -0800] "GET /stat141/Winter04/ HTTP/1.1" 200 2585 "http://anson.ucdavis.edu/courses/"\n'

# Capture groups inside the square brackets, in order:
# day / month / year : hour : minute : second, then a space and the rest
# (the UTC offset in the sample line above).
pattern = r'\[(\d+)\/(\w+)\/(\d+):(\d+):(\d+):(\d+) (.+)\]'
# With several groups, findall returns one tuple of groups per match;
# take the first match and unpack its seven fields.
matches = re.findall(pattern, first)
day, month, year, hour, minute, second, time_zone = matches[0]
print(day, month, year, hour, minute, second, time_zone)

# 26 Jan 2014 10 47 58 -0800