"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Text Methods"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"A normal Python string has a variety of method calls available:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"mystring = 'hello'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Hello'"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mystring.capitalize()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mystring.isdigit()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Help on class str in module builtins:\n",
"\n",
"class str(object)\n",
" | str(object='') -> str\n",
" | str(bytes_or_buffer[, encoding[, errors]]) -> str\n",
" | \n",
" | Create a new string object from the given object. If encoding or\n",
" | errors is specified, then the object must expose a data buffer\n",
" | that will be decoded using the given encoding and error handler.\n",
" | Otherwise, returns the result of object.__str__() (if defined)\n",
" | or repr(object).\n",
" | encoding defaults to sys.getdefaultencoding().\n",
" | errors defaults to 'strict'.\n",
" | \n",
" | Methods defined here:\n",
" | \n",
" | __add__(self, value, /)\n",
" | Return self+value.\n",
" | \n",
" | __contains__(self, key, /)\n",
" | Return key in self.\n",
" | \n",
" | __eq__(self, value, /)\n",
" | Return self==value.\n",
" | \n",
" | __format__(self, format_spec, /)\n",
" | Return a formatted version of the string as described by format_spec.\n",
" | \n",
" | __ge__(self, value, /)\n",
" | Return self>=value.\n",
" | \n",
" | __getattribute__(self, name, /)\n",
" | Return getattr(self, name).\n",
" | \n",
" | __getitem__(self, key, /)\n",
" | Return self[key].\n",
" | \n",
" | __getnewargs__(...)\n",
" | \n",
" | __gt__(self, value, /)\n",
" | Return self>value.\n",
" | \n",
" | __hash__(self, /)\n",
" | Return hash(self).\n",
" | \n",
" | __iter__(self, /)\n",
" | Implement iter(self).\n",
" | \n",
" | __le__(self, value, /)\n",
" | Return self<=value.\n",
" | \n",
" | __len__(self, /)\n",
" | Return len(self).\n",
" | \n",
" | __lt__(self, value, /)\n",
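" | Return self<value.\n",
" | \n",
" | __mod__(self, value, /)\n",
" | Return self%value.\n",
" | \n",
" | __mul__(self, value, /)\n",
" | Return self*value.\n",
" | \n",
" | __ne__(self, value, /)\n",
" | Return self!=value.\n",
" | \n",
" | __repr__(self, /)\n",
" | Return repr(self).\n",
" | \n",
" | __rmod__(self, value, /)\n",
" | Return value%self.\n",
" | \n",
" | __rmul__(self, value, /)\n",
" | Return value*self.\n",
" | \n",
" | __sizeof__(self, /)\n",
" | Return the size of the string in memory, in bytes.\n",
" | \n",
" | __str__(self, /)\n",
" | Return str(self).\n",
" | \n",
" | capitalize(self, /)\n",
" | Return a capitalized version of the string.\n",
" | \n",
" | More specifically, make the first character have upper case and the rest lower\n",
" | case.\n",
" | \n",
" | casefold(self, /)\n",
" | Return a version of the string suitable for caseless comparisons.\n",
" | \n",
" | center(self, width, fillchar=' ', /)\n",
" | Return a centered string of length width.\n",
" | \n",
" | Padding is done using the specified fill character (default is a space).\n",
" | \n",
" | count(...)\n",
" | S.count(sub[, start[, end]]) -> int\n",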
" | \n",
" | Return the number of non-overlapping occurrences of substring sub in\n",
" | string S[start:end]. Optional arguments start and end are\n",
" | interpreted as in slice notation.\n",
" | \n",
" | encode(self, /, encoding='utf-8', errors='strict')\n",
" | Encode the string using the codec registered for encoding.\n",
" | \n",
" | encoding\n",
" | The encoding in which to encode the string.\n",
" | errors\n",
" | The error handling scheme to use for encoding errors.\n",
" | The default is 'strict' meaning that encoding errors raise a\n",
" | UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n",
" | 'xmlcharrefreplace' as well as any other name registered with\n",
" | codecs.register_error that can handle UnicodeEncodeErrors.\n",
" | \n",
" | endswith(...)\n",
" | S.endswith(suffix[, start[, end]]) -> bool\n",
" | \n",
" | Return True if S ends with the specified suffix, False otherwise.\n",
" | With optional start, test S beginning at that position.\n",
" | With optional end, stop comparing S at that position.\n",
" | suffix can also be a tuple of strings to try.\n",
" | \n",
" | expandtabs(self, /, tabsize=8)\n",
" | Return a copy where all tab characters are expanded using spaces.\n",
" | \n",
" | If tabsize is not given, a tab size of 8 characters is assumed.\n",
" | \n",
" | find(...)\n",
" | S.find(sub[, start[, end]]) -> int\n",
" | \n",
" | Return the lowest index in S where substring sub is found,\n",
" | such that sub is contained within S[start:end]. Optional\n",
" | arguments start and end are interpreted as in slice notation.\n",
" | \n",
" | Return -1 on failure.\n",
" | \n",
" | format(...)\n",
" | S.format(*args, **kwargs) -> str\n",
" | \n",
" | Return a formatted version of S, using substitutions from args and kwargs.\n",
" | The substitutions are identified by braces ('{' and '}').\n",
" | \n",
" | format_map(...)\n",
" | S.format_map(mapping) -> str\n",
" | \n",
" | Return a formatted version of S, using substitutions from mapping.\n",
" | The substitutions are identified by braces ('{' and '}').\n",
" | \n",
" | index(...)\n",
" | S.index(sub[, start[, end]]) -> int\n",
" | \n",
" | Return the lowest index in S where substring sub is found, \n",
" | such that sub is contained within S[start:end]. Optional\n",
" | arguments start and end are interpreted as in slice notation.\n",
" | \n",
" | Raises ValueError when the substring is not found.\n",
" | \n",
" | isalnum(self, /)\n",
" | Return True if the string is an alpha-numeric string, False otherwise.\n",
" | \n",
" | A string is alpha-numeric if all characters in the string are alpha-numeric and\n",
" | there is at least one character in the string.\n",
" | \n",
" | isalpha(self, /)\n",
" | Return True if the string is an alphabetic string, False otherwise.\n",
" | \n",
" | A string is alphabetic if all characters in the string are alphabetic and there\n",
" | is at least one character in the string.\n",
" | \n",
" | isascii(self, /)\n",
" | Return True if all characters in the string are ASCII, False otherwise.\n",
" | \n",
" | ASCII characters have code points in the range U+0000-U+007F.\n",
" | Empty string is ASCII too.\n",
" | \n",
" | isdecimal(self, /)\n",
" | Return True if the string is a decimal string, False otherwise.\n",
" | \n",
" | A string is a decimal string if all characters in the string are decimal and\n",
" | there is at least one character in the string.\n",
" | \n",
" | isdigit(self, /)\n",
" | Return True if the string is a digit string, False otherwise.\n",
" | \n",
" | A string is a digit string if all characters in the string are digits and there\n",
" | is at least one character in the string.\n",
" | \n",
" | isidentifier(self, /)\n",
" | Return True if the string is a valid Python identifier, False otherwise.\n",
" | \n",
" | Use keyword.iskeyword() to test for reserved identifiers such as \"def\" and\n",
" | \"class\".\n",
" | \n",
" | islower(self, /)\n",
" | Return True if the string is a lowercase string, False otherwise.\n",
" | \n",
" | A string is lowercase if all cased characters in the string are lowercase and\n",
" | there is at least one cased character in the string.\n",
" | \n",
" | isnumeric(self, /)\n",
" | Return True if the string is a numeric string, False otherwise.\n",
" | \n",
" | A string is numeric if all characters in the string are numeric and there is at\n",
" | least one character in the string.\n",
" | \n",
" | isprintable(self, /)\n",
" | Return True if the string is printable, False otherwise.\n",
" | \n",
" | A string is printable if all of its characters are considered printable in\n",
" | repr() or if it is empty.\n",
" | \n",
" | isspace(self, /)\n",
" | Return True if the string is a whitespace string, False otherwise.\n",
" | \n",
" | A string is whitespace if all characters in the string are whitespace and there\n",
" | is at least one character in the string.\n",
" | \n",
" | istitle(self, /)\n",
" | Return True if the string is a title-cased string, False otherwise.\n",
" | \n",
" | In a title-cased string, upper- and title-case characters may only\n",
" | follow uncased characters and lowercase characters only cased ones.\n",
" | \n",
" | isupper(self, /)\n",
" | Return True if the string is an uppercase string, False otherwise.\n",
" | \n",
" | A string is uppercase if all cased characters in the string are uppercase and\n",
" | there is at least one cased character in the string.\n",
" | \n",
" | join(self, iterable, /)\n",
" | Concatenate any number of strings.\n",
" | \n",
" | The string whose method is called is inserted in between each given string.\n",
" | The result is returned as a new string.\n",
" | \n",
" | Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'\n",
" | \n",
" | ljust(self, width, fillchar=' ', /)\n",
" | Return a left-justified string of length width.\n",
" | \n",
" | Padding is done using the specified fill character (default is a space).\n",
" | \n",
" | lower(self, /)\n",
" | Return a copy of the string converted to lowercase.\n",
" | \n",
" | lstrip(self, chars=None, /)\n",
" | Return a copy of the string with leading whitespace removed.\n",
" | \n",
" | If chars is given and not None, remove characters in chars instead.\n",
" | \n",
" | partition(self, sep, /)\n",
" | Partition the string into three parts using the given separator.\n",
" | \n",
" | This will search for the separator in the string. If the separator is found,\n",
" | returns a 3-tuple containing the part before the separator, the separator\n",
" | itself, and the part after it.\n",
" | \n",
" | If the separator is not found, returns a 3-tuple containing the original string\n",
" | and two empty strings.\n",
" | \n",
" | replace(self, old, new, count=-1, /)\n",
" | Return a copy with all occurrences of substring old replaced by new.\n",
" | \n",
" | count\n",
" | Maximum number of occurrences to replace.\n",
" | -1 (the default value) means replace all occurrences.\n",
" | \n",
" | If the optional argument count is given, only the first count occurrences are\n",
" | replaced.\n",
" | \n",
" | rfind(...)\n",
" | S.rfind(sub[, start[, end]]) -> int\n",
" | \n",
" | Return the highest index in S where substring sub is found,\n",
" | such that sub is contained within S[start:end]. Optional\n",
" | arguments start and end are interpreted as in slice notation.\n",
" | \n",
" | Return -1 on failure.\n",
" | \n",
" | rindex(...)\n",
" | S.rindex(sub[, start[, end]]) -> int\n",
" | \n",
" | Return the highest index in S where substring sub is found,\n",
" | such that sub is contained within S[start:end]. Optional\n",
" | arguments start and end are interpreted as in slice notation.\n",
" | \n",
" | Raises ValueError when the substring is not found.\n",
" | \n",
" | rjust(self, width, fillchar=' ', /)\n",
" | Return a right-justified string of length width.\n",
" | \n",
" | Padding is done using the specified fill character (default is a space).\n",
" | \n",
" | rpartition(self, sep, /)\n",
" | Partition the string into three parts using the given separator.\n",
" | \n",
" | This will search for the separator in the string, starting at the end. If\n",
" | the separator is found, returns a 3-tuple containing the part before the\n",
" | separator, the separator itself, and the part after it.\n",
" | \n",
" | If the separator is not found, returns a 3-tuple containing two empty strings\n",
" | and the original string.\n",
" | \n",
" | rsplit(self, /, sep=None, maxsplit=-1)\n",
" | Return a list of the words in the string, using sep as the delimiter string.\n",
" | \n",
" | sep\n",
" | The delimiter according which to split the string.\n",
" | None (the default value) means split according to any whitespace,\n",
" | and discard empty strings from the result.\n",
" | maxsplit\n",
" | Maximum number of splits to do.\n",
" | -1 (the default value) means no limit.\n",
" | \n",
" | Splits are done starting at the end of the string and working to the front.\n",
" | \n",
" | rstrip(self, chars=None, /)\n",
" | Return a copy of the string with trailing whitespace removed.\n",
" | \n",
" | If chars is given and not None, remove characters in chars instead.\n",
" | \n",
" | split(self, /, sep=None, maxsplit=-1)\n",
" | Return a list of the words in the string, using sep as the delimiter string.\n",
" | \n",
" | sep\n",
" | The delimiter according which to split the string.\n",
" | None (the default value) means split according to any whitespace,\n",
" | and discard empty strings from the result.\n",
" | maxsplit\n",
" | Maximum number of splits to do.\n",
" | -1 (the default value) means no limit.\n",
" | \n",
" | splitlines(self, /, keepends=False)\n",
" | Return a list of the lines in the string, breaking at line boundaries.\n",
" | \n",
" | Line breaks are not included in the resulting list unless keepends is given and\n",
" | true.\n",
" | \n",
" | startswith(...)\n",
" | S.startswith(prefix[, start[, end]]) -> bool\n",
" | \n",
" | Return True if S starts with the specified prefix, False otherwise.\n",
" | With optional start, test S beginning at that position.\n",
" | With optional end, stop comparing S at that position.\n",
" | prefix can also be a tuple of strings to try.\n",
" | \n",
" | strip(self, chars=None, /)\n",
" | Return a copy of the string with leading and trailing whitespace removed.\n",
" | \n",
" | If chars is given and not None, remove characters in chars instead.\n",
" | \n",
" | swapcase(self, /)\n",
" | Convert uppercase characters to lowercase and lowercase characters to uppercase.\n",
" | \n",
" | title(self, /)\n",
" | Return a version of the string where each word is titlecased.\n",
" | \n",
" | More specifically, words start with uppercased characters and all remaining\n",
" | cased characters have lower case.\n",
" | \n",
" | translate(self, table, /)\n",
" | Replace each character in the string using the given translation table.\n",
" | \n",
" | table\n",
" | Translation table, which must be a mapping of Unicode ordinals to\n",
" | Unicode ordinals, strings, or None.\n",
" | \n",
" | The table must implement lookup/indexing via __getitem__, for instance a\n",
" | dictionary or list. If this operation raises LookupError, the character is\n",
" | left untouched. Characters mapped to None are deleted.\n",
" | \n",
" | upper(self, /)\n",
" | Return a copy of the string converted to uppercase.\n",
" | \n",
" | zfill(self, width, /)\n",
" | Pad a numeric string with zeros on the left, to fill a field of the given width.\n",
" | \n",
" | The string is never truncated.\n",
" | \n",
" | ----------------------------------------------------------------------\n",
" | Static methods defined here:\n",
" | \n",
" | __new__(*args, **kwargs) from builtins.type\n",
" | Create and return a new object. See help(type) for accurate signature.\n",
" | \n",
" | maketrans(x, y=None, z=None, /)\n",
" | Return a translation table usable for str.translate().\n",
" | \n",
" | If there is only one argument, it must be a dictionary mapping Unicode\n",
" | ordinals (integers) or characters to Unicode ordinals, strings or None.\n",
" | Character keys will be then converted to ordinals.\n",
" | If there are two arguments, they must be strings of equal length, and\n",
" | in the resulting dictionary, each character in x will be mapped to the\n",
" | character at the same position in y. If there is a third argument, it\n",
" | must be a string, whose characters will be mapped to None in the result.\n",
"\n"
]
}
],
"source": [
"help(str)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Pandas and Text\n",
"\n",
"Pandas can do a lot more with text than what we show here. The full online documentation, covering topics such as advanced string indexing and regular expressions with pandas, can be found here: https://pandas.pydata.org/docs/user_guide/text.html\n",
"\n",
"## Text Methods on a Pandas String Column"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"names = pd.Series(['andrew','bobo','claire','david','4'])"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 andrew\n",
"1 bobo\n",
"2 claire\n",
"3 david\n",
"4 4\n",
"dtype: object"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"names"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 Andrew\n",
"1 Bobo\n",
"2 Claire\n",
"3 David\n",
"4 4\n",
"dtype: object"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"names.str.capitalize()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 False\n",
"1 False\n",
"2 False\n",
"3 False\n",
"4 True\n",
"dtype: bool"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"names.str.isdigit()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Splitting, Grabbing, and Expanding"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"tech_finance = ['GOOG,APPL,AMZN','JPM,BAC,GS']"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(tech_finance)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"tickers = pd.Series(tech_finance)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 GOOG,APPL,AMZN\n",
"1 JPM,BAC,GS\n",
"dtype: object"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tickers"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 [GOOG, APPL, AMZN]\n",
"1 [JPM, BAC, GS]\n",
"dtype: object"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tickers.str.split(',')"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 GOOG\n",
"1 JPM\n",
"dtype: object"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tickers.str.split(',').str[0]"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"