{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "___\n",
    "\n",
    "<a href='http://www.pieriandata.com'><img src='../Pierian_Data_Logo.png'/></a>\n",
    "___\n",
    "<center><em>Copyright by Pierian Data Inc.</em></center>\n",
    "<center><em>For more information, visit us at <a href='http://www.pieriandata.com'>www.pieriandata.com</a></em></center>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Text Methods"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "A normal Python string has a variety of method calls available:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "mystring = 'hello'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Hello'"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mystring.capitalize()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mystring.isdigit()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Help on class str in module builtins:\n",
      "\n",
      "class str(object)\n",
      " |  str(object='') -> str\n",
      " |  str(bytes_or_buffer[, encoding[, errors]]) -> str\n",
      " |  \n",
      " |  Create a new string object from the given object. If encoding or\n",
      " |  errors is specified, then the object must expose a data buffer\n",
      " |  that will be decoded using the given encoding and error handler.\n",
      " |  Otherwise, returns the result of object.__str__() (if defined)\n",
      " |  or repr(object).\n",
      " |  encoding defaults to sys.getdefaultencoding().\n",
      " |  errors defaults to 'strict'.\n",
      " |  \n",
      " |  Methods defined here:\n",
      " |  \n",
      " |  __add__(self, value, /)\n",
      " |      Return self+value.\n",
      " |  \n",
      " |  __contains__(self, key, /)\n",
      " |      Return key in self.\n",
      " |  \n",
      " |  __eq__(self, value, /)\n",
      " |      Return self==value.\n",
      " |  \n",
      " |  __format__(self, format_spec, /)\n",
      " |      Return a formatted version of the string as described by format_spec.\n",
      " |  \n",
      " |  __ge__(self, value, /)\n",
      " |      Return self>=value.\n",
      " |  \n",
      " |  __getattribute__(self, name, /)\n",
      " |      Return getattr(self, name).\n",
      " |  \n",
      " |  __getitem__(self, key, /)\n",
      " |      Return self[key].\n",
      " |  \n",
      " |  __getnewargs__(...)\n",
      " |  \n",
      " |  __gt__(self, value, /)\n",
      " |      Return self>value.\n",
      " |  \n",
      " |  __hash__(self, /)\n",
      " |      Return hash(self).\n",
      " |  \n",
      " |  __iter__(self, /)\n",
      " |      Implement iter(self).\n",
      " |  \n",
      " |  __le__(self, value, /)\n",
      " |      Return self<=value.\n",
      " |  \n",
      " |  __len__(self, /)\n",
      " |      Return len(self).\n",
      " |  \n",
      " |  __lt__(self, value, /)\n",
      " |      Return self<value.\n",
      " |  \n",
      " |  __mod__(self, value, /)\n",
      " |      Return self%value.\n",
      " |  \n",
      " |  __mul__(self, value, /)\n",
      " |      Return self*value.\n",
      " |  \n",
      " |  __ne__(self, value, /)\n",
      " |      Return self!=value.\n",
      " |  \n",
      " |  __repr__(self, /)\n",
      " |      Return repr(self).\n",
      " |  \n",
      " |  __rmod__(self, value, /)\n",
      " |      Return value%self.\n",
      " |  \n",
      " |  __rmul__(self, value, /)\n",
      " |      Return value*self.\n",
      " |  \n",
      " |  __sizeof__(self, /)\n",
      " |      Return the size of the string in memory, in bytes.\n",
      " |  \n",
      " |  __str__(self, /)\n",
      " |      Return str(self).\n",
      " |  \n",
      " |  capitalize(self, /)\n",
      " |      Return a capitalized version of the string.\n",
      " |      \n",
      " |      More specifically, make the first character have upper case and the rest lower\n",
      " |      case.\n",
      " |  \n",
      " |  casefold(self, /)\n",
      " |      Return a version of the string suitable for caseless comparisons.\n",
      " |  \n",
      " |  center(self, width, fillchar=' ', /)\n",
      " |      Return a centered string of length width.\n",
      " |      \n",
      " |      Padding is done using the specified fill character (default is a space).\n",
      " |  \n",
      " |  count(...)\n",
      " |      S.count(sub[, start[, end]]) -> int\n",
      " |      \n",
      " |      Return the number of non-overlapping occurrences of substring sub in\n",
      " |      string S[start:end].  Optional arguments start and end are\n",
      " |      interpreted as in slice notation.\n",
      " |  \n",
      " |  encode(self, /, encoding='utf-8', errors='strict')\n",
      " |      Encode the string using the codec registered for encoding.\n",
      " |      \n",
      " |      encoding\n",
      " |        The encoding in which to encode the string.\n",
      " |      errors\n",
      " |        The error handling scheme to use for encoding errors.\n",
      " |        The default is 'strict' meaning that encoding errors raise a\n",
      " |        UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and\n",
      " |        'xmlcharrefreplace' as well as any other name registered with\n",
      " |        codecs.register_error that can handle UnicodeEncodeErrors.\n",
      " |  \n",
      " |  endswith(...)\n",
      " |      S.endswith(suffix[, start[, end]]) -> bool\n",
      " |      \n",
      " |      Return True if S ends with the specified suffix, False otherwise.\n",
      " |      With optional start, test S beginning at that position.\n",
      " |      With optional end, stop comparing S at that position.\n",
      " |      suffix can also be a tuple of strings to try.\n",
      " |  \n",
      " |  expandtabs(self, /, tabsize=8)\n",
      " |      Return a copy where all tab characters are expanded using spaces.\n",
      " |      \n",
      " |      If tabsize is not given, a tab size of 8 characters is assumed.\n",
      " |  \n",
      " |  find(...)\n",
      " |      S.find(sub[, start[, end]]) -> int\n",
      " |      \n",
      " |      Return the lowest index in S where substring sub is found,\n",
      " |      such that sub is contained within S[start:end].  Optional\n",
      " |      arguments start and end are interpreted as in slice notation.\n",
      " |      \n",
      " |      Return -1 on failure.\n",
      " |  \n",
      " |  format(...)\n",
      " |      S.format(*args, **kwargs) -> str\n",
      " |      \n",
      " |      Return a formatted version of S, using substitutions from args and kwargs.\n",
      " |      The substitutions are identified by braces ('{' and '}').\n",
      " |  \n",
      " |  format_map(...)\n",
      " |      S.format_map(mapping) -> str\n",
      " |      \n",
      " |      Return a formatted version of S, using substitutions from mapping.\n",
      " |      The substitutions are identified by braces ('{' and '}').\n",
      " |  \n",
      " |  index(...)\n",
      " |      S.index(sub[, start[, end]]) -> int\n",
      " |      \n",
      " |      Return the lowest index in S where substring sub is found, \n",
      " |      such that sub is contained within S[start:end].  Optional\n",
      " |      arguments start and end are interpreted as in slice notation.\n",
      " |      \n",
      " |      Raises ValueError when the substring is not found.\n",
      " |  \n",
      " |  isalnum(self, /)\n",
      " |      Return True if the string is an alpha-numeric string, False otherwise.\n",
      " |      \n",
      " |      A string is alpha-numeric if all characters in the string are alpha-numeric and\n",
      " |      there is at least one character in the string.\n",
      " |  \n",
      " |  isalpha(self, /)\n",
      " |      Return True if the string is an alphabetic string, False otherwise.\n",
      " |      \n",
      " |      A string is alphabetic if all characters in the string are alphabetic and there\n",
      " |      is at least one character in the string.\n",
      " |  \n",
      " |  isascii(self, /)\n",
      " |      Return True if all characters in the string are ASCII, False otherwise.\n",
      " |      \n",
      " |      ASCII characters have code points in the range U+0000-U+007F.\n",
      " |      Empty string is ASCII too.\n",
      " |  \n",
      " |  isdecimal(self, /)\n",
      " |      Return True if the string is a decimal string, False otherwise.\n",
      " |      \n",
      " |      A string is a decimal string if all characters in the string are decimal and\n",
      " |      there is at least one character in the string.\n",
      " |  \n",
      " |  isdigit(self, /)\n",
      " |      Return True if the string is a digit string, False otherwise.\n",
      " |      \n",
      " |      A string is a digit string if all characters in the string are digits and there\n",
      " |      is at least one character in the string.\n",
      " |  \n",
      " |  isidentifier(self, /)\n",
      " |      Return True if the string is a valid Python identifier, False otherwise.\n",
      " |      \n",
      " |      Use keyword.iskeyword() to test for reserved identifiers such as \"def\" and\n",
      " |      \"class\".\n",
      " |  \n",
      " |  islower(self, /)\n",
      " |      Return True if the string is a lowercase string, False otherwise.\n",
      " |      \n",
      " |      A string is lowercase if all cased characters in the string are lowercase and\n",
      " |      there is at least one cased character in the string.\n",
      " |  \n",
      " |  isnumeric(self, /)\n",
      " |      Return True if the string is a numeric string, False otherwise.\n",
      " |      \n",
      " |      A string is numeric if all characters in the string are numeric and there is at\n",
      " |      least one character in the string.\n",
      " |  \n",
      " |  isprintable(self, /)\n",
      " |      Return True if the string is printable, False otherwise.\n",
      " |      \n",
      " |      A string is printable if all of its characters are considered printable in\n",
      " |      repr() or if it is empty.\n",
      " |  \n",
      " |  isspace(self, /)\n",
      " |      Return True if the string is a whitespace string, False otherwise.\n",
      " |      \n",
      " |      A string is whitespace if all characters in the string are whitespace and there\n",
      " |      is at least one character in the string.\n",
      " |  \n",
      " |  istitle(self, /)\n",
      " |      Return True if the string is a title-cased string, False otherwise.\n",
      " |      \n",
      " |      In a title-cased string, upper- and title-case characters may only\n",
      " |      follow uncased characters and lowercase characters only cased ones.\n",
      " |  \n",
      " |  isupper(self, /)\n",
      " |      Return True if the string is an uppercase string, False otherwise.\n",
      " |      \n",
      " |      A string is uppercase if all cased characters in the string are uppercase and\n",
      " |      there is at least one cased character in the string.\n",
      " |  \n",
      " |  join(self, iterable, /)\n",
      " |      Concatenate any number of strings.\n",
      " |      \n",
      " |      The string whose method is called is inserted in between each given string.\n",
      " |      The result is returned as a new string.\n",
      " |      \n",
      " |      Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'\n",
      " |  \n",
      " |  ljust(self, width, fillchar=' ', /)\n",
      " |      Return a left-justified string of length width.\n",
      " |      \n",
      " |      Padding is done using the specified fill character (default is a space).\n",
      " |  \n",
      " |  lower(self, /)\n",
      " |      Return a copy of the string converted to lowercase.\n",
      " |  \n",
      " |  lstrip(self, chars=None, /)\n",
      " |      Return a copy of the string with leading whitespace removed.\n",
      " |      \n",
      " |      If chars is given and not None, remove characters in chars instead.\n",
      " |  \n",
      " |  partition(self, sep, /)\n",
      " |      Partition the string into three parts using the given separator.\n",
      " |      \n",
      " |      This will search for the separator in the string.  If the separator is found,\n",
      " |      returns a 3-tuple containing the part before the separator, the separator\n",
      " |      itself, and the part after it.\n",
      " |      \n",
      " |      If the separator is not found, returns a 3-tuple containing the original string\n",
      " |      and two empty strings.\n",
      " |  \n",
      " |  replace(self, old, new, count=-1, /)\n",
      " |      Return a copy with all occurrences of substring old replaced by new.\n",
      " |      \n",
      " |        count\n",
      " |          Maximum number of occurrences to replace.\n",
      " |          -1 (the default value) means replace all occurrences.\n",
      " |      \n",
      " |      If the optional argument count is given, only the first count occurrences are\n",
      " |      replaced.\n",
      " |  \n",
      " |  rfind(...)\n",
      " |      S.rfind(sub[, start[, end]]) -> int\n",
      " |      \n",
      " |      Return the highest index in S where substring sub is found,\n",
      " |      such that sub is contained within S[start:end].  Optional\n",
      " |      arguments start and end are interpreted as in slice notation.\n",
      " |      \n",
      " |      Return -1 on failure.\n",
      " |  \n",
      " |  rindex(...)\n",
      " |      S.rindex(sub[, start[, end]]) -> int\n",
      " |      \n",
      " |      Return the highest index in S where substring sub is found,\n",
      " |      such that sub is contained within S[start:end].  Optional\n",
      " |      arguments start and end are interpreted as in slice notation.\n",
      " |      \n",
      " |      Raises ValueError when the substring is not found.\n",
      " |  \n",
      " |  rjust(self, width, fillchar=' ', /)\n",
      " |      Return a right-justified string of length width.\n",
      " |      \n",
      " |      Padding is done using the specified fill character (default is a space).\n",
      " |  \n",
      " |  rpartition(self, sep, /)\n",
      " |      Partition the string into three parts using the given separator.\n",
      " |      \n",
      " |      This will search for the separator in the string, starting at the end. If\n",
      " |      the separator is found, returns a 3-tuple containing the part before the\n",
      " |      separator, the separator itself, and the part after it.\n",
      " |      \n",
      " |      If the separator is not found, returns a 3-tuple containing two empty strings\n",
      " |      and the original string.\n",
      " |  \n",
      " |  rsplit(self, /, sep=None, maxsplit=-1)\n",
      " |      Return a list of the words in the string, using sep as the delimiter string.\n",
      " |      \n",
      " |        sep\n",
      " |          The delimiter according which to split the string.\n",
      " |          None (the default value) means split according to any whitespace,\n",
      " |          and discard empty strings from the result.\n",
      " |        maxsplit\n",
      " |          Maximum number of splits to do.\n",
      " |          -1 (the default value) means no limit.\n",
      " |      \n",
      " |      Splits are done starting at the end of the string and working to the front.\n",
      " |  \n",
      " |  rstrip(self, chars=None, /)\n",
      " |      Return a copy of the string with trailing whitespace removed.\n",
      " |      \n",
      " |      If chars is given and not None, remove characters in chars instead.\n",
      " |  \n",
      " |  split(self, /, sep=None, maxsplit=-1)\n",
      " |      Return a list of the words in the string, using sep as the delimiter string.\n",
      " |      \n",
      " |      sep\n",
      " |        The delimiter according which to split the string.\n",
      " |        None (the default value) means split according to any whitespace,\n",
      " |        and discard empty strings from the result.\n",
      " |      maxsplit\n",
      " |        Maximum number of splits to do.\n",
      " |        -1 (the default value) means no limit.\n",
      " |  \n",
      " |  splitlines(self, /, keepends=False)\n",
      " |      Return a list of the lines in the string, breaking at line boundaries.\n",
      " |      \n",
      " |      Line breaks are not included in the resulting list unless keepends is given and\n",
      " |      true.\n",
      " |  \n",
      " |  startswith(...)\n",
      " |      S.startswith(prefix[, start[, end]]) -> bool\n",
      " |      \n",
      " |      Return True if S starts with the specified prefix, False otherwise.\n",
      " |      With optional start, test S beginning at that position.\n",
      " |      With optional end, stop comparing S at that position.\n",
      " |      prefix can also be a tuple of strings to try.\n",
      " |  \n",
      " |  strip(self, chars=None, /)\n",
      " |      Return a copy of the string with leading and trailing whitespace removed.\n",
      " |      \n",
      " |      If chars is given and not None, remove characters in chars instead.\n",
      " |  \n",
      " |  swapcase(self, /)\n",
      " |      Convert uppercase characters to lowercase and lowercase characters to uppercase.\n",
      " |  \n",
      " |  title(self, /)\n",
      " |      Return a version of the string where each word is titlecased.\n",
      " |      \n",
      " |      More specifically, words start with uppercased characters and all remaining\n",
      " |      cased characters have lower case.\n",
      " |  \n",
      " |  translate(self, table, /)\n",
      " |      Replace each character in the string using the given translation table.\n",
      " |      \n",
      " |        table\n",
      " |          Translation table, which must be a mapping of Unicode ordinals to\n",
      " |          Unicode ordinals, strings, or None.\n",
      " |      \n",
      " |      The table must implement lookup/indexing via __getitem__, for instance a\n",
      " |      dictionary or list.  If this operation raises LookupError, the character is\n",
      " |      left untouched.  Characters mapped to None are deleted.\n",
      " |  \n",
      " |  upper(self, /)\n",
      " |      Return a copy of the string converted to uppercase.\n",
      " |  \n",
      " |  zfill(self, width, /)\n",
      " |      Pad a numeric string with zeros on the left, to fill a field of the given width.\n",
      " |      \n",
      " |      The string is never truncated.\n",
      " |  \n",
      " |  ----------------------------------------------------------------------\n",
      " |  Static methods defined here:\n",
      " |  \n",
      " |  __new__(*args, **kwargs) from builtins.type\n",
      " |      Create and return a new object.  See help(type) for accurate signature.\n",
      " |  \n",
      " |  maketrans(x, y=None, z=None, /)\n",
      " |      Return a translation table usable for str.translate().\n",
      " |      \n",
      " |      If there is only one argument, it must be a dictionary mapping Unicode\n",
      " |      ordinals (integers) or characters to Unicode ordinals, strings or None.\n",
      " |      Character keys will be then converted to ordinals.\n",
      " |      If there are two arguments, they must be strings of equal length, and\n",
      " |      in the resulting dictionary, each character in x will be mapped to the\n",
      " |      character at the same position in y. If there is a third argument, it\n",
      " |      must be a string, whose characters will be mapped to None in the result.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "help(str)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pandas and Text\n",
    "\n",
    "Pandas can do a lot more than what we show here. Full online documentation on things like advanced string indexing and regular expressions with pandas can be found here: https://pandas.pydata.org/docs/user_guide/text.html\n",
    "\n",
    "## Text Methods on Pandas String Column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "names = pd.Series(['andrew','bobo','claire','david','4'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    andrew\n",
       "1      bobo\n",
       "2    claire\n",
       "3     david\n",
       "4         4\n",
       "dtype: object"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    Andrew\n",
       "1      Bobo\n",
       "2    Claire\n",
       "3     David\n",
       "4         4\n",
       "dtype: object"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "names.str.capitalize()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    False\n",
       "1    False\n",
       "2    False\n",
       "3    False\n",
       "4     True\n",
       "dtype: bool"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "names.str.isdigit()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Splitting , Grabbing, and Expanding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "tech_finance = ['GOOG,APPL,AMZN','JPM,BAC,GS']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(tech_finance)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "tickers = pd.Series(tech_finance)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    GOOG,APPL,AMZN\n",
       "1        JPM,BAC,GS\n",
       "dtype: object"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tickers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    [GOOG, APPL, AMZN]\n",
       "1        [JPM, BAC, GS]\n",
       "dtype: object"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tickers.str.split(',')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    GOOG\n",
       "1     JPM\n",
       "dtype: object"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tickers.str.split(',').str[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>GOOG</td>\n",
       "      <td>APPL</td>\n",
       "      <td>AMZN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>JPM</td>\n",
       "      <td>BAC</td>\n",
       "      <td>GS</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      0     1     2\n",
       "0  GOOG  APPL  AMZN\n",
       "1   JPM   BAC    GS"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tickers.str.split(',',expand=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cleaning or Editing Strings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "messy_names = pd.Series([\"andrew  \",\"bo;bo\",\"  claire  \"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0      andrew  \n",
       "1         bo;bo\n",
       "2      claire  \n",
       "dtype: object"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Notice the \"mis-alignment\" on the right hand side due to spacing in \"andrew  \" and \"  claire  \"\n",
    "messy_names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0      andrew  \n",
       "1          bobo\n",
       "2      claire  \n",
       "dtype: object"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "messy_names.str.replace(\";\",\"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    andrew\n",
       "1     bo;bo\n",
       "2    claire\n",
       "dtype: object"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "messy_names.str.strip()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    andrew\n",
       "1      bobo\n",
       "2    claire\n",
       "dtype: object"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "messy_names.str.replace(\";\",\"\").str.strip()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    Andrew\n",
       "1      Bobo\n",
       "2    Claire\n",
       "dtype: object"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "messy_names.str.replace(\";\",\"\").str.strip().str.capitalize()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Alternative with Custom apply() call"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def cleanup(name):\n",
    "    name = name.replace(\";\",\"\")\n",
    "    name = name.strip()\n",
    "    name = name.capitalize()\n",
    "    return name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0      andrew  \n",
       "1         bo;bo\n",
       "2      claire  \n",
       "dtype: object"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "messy_names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    Andrew\n",
       "1      Bobo\n",
       "2    Claire\n",
       "dtype: object"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "messy_names.apply(cleanup)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Which one is more efficient?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import timeit \n",
    "  \n",
    "# code snippet to be executed only once \n",
    "setup = '''\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "messy_names = pd.Series([\"andrew  \",\"bo;bo\",\"  claire  \"])\n",
    "def cleanup(name):\n",
    "    name = name.replace(\";\",\"\")\n",
    "    name = name.strip()\n",
    "    name = name.capitalize()\n",
    "    return name\n",
    "'''\n",
    "  \n",
    "# code snippet whose execution time is to be measured \n",
    "stmt_pandas_str = ''' \n",
    "messy_names.str.replace(\";\",\"\").str.strip().str.capitalize()\n",
    "'''\n",
    "\n",
    "stmt_pandas_apply = '''\n",
    "messy_names.apply(cleanup)\n",
    "'''\n",
    "\n",
    "stmt_pandas_vectorize='''\n",
    "np.vectorize(cleanup)(messy_names)\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3.931618999999955"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "timeit.timeit(setup = setup, \n",
    "                    stmt = stmt_pandas_str, \n",
    "                    number = 10000) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.2268500999999787"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "timeit.timeit(setup = setup, \n",
    "                    stmt = stmt_pandas_apply, \n",
    "                    number = 10000) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.28283379999993485"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "timeit.timeit(setup = setup, \n",
    "                    stmt = stmt_pandas_vectorize, \n",
    "                    number = 10000) "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Wow! While .str() methods can be extremely convienent, when it comes to performance, don't forget about np.vectorize()! Review the \"Useful Methods\" lecture for a deeper discussion on np.vectorize()"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}