{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "lVvPGn1Qfkxu"
   },
   "source": [
    "# More Regular Expression Examples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "8fax2difIRo4"
   },
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "p = re.compile('[Pp]umas?|[Cc]ougars?')\n",
    "p.findall('I saw a puma chasing two cougars.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "bY4Z9_qTFGKi"
   },
   "outputs": [],
   "source": [
    "text = 'I saw a puma puma puma puma in the jungle.'\n",
    "p = re.compile('(puma )+')\n",
    "m = p.search(text)\n",
    "print(m)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "5ac82669"
   },
   "outputs": [],
   "source": [
    "p = re.compile('[Ww]oodchuck')\n",
    "m = p.match('Woodchucks ran after a woodchuck.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "3bc1f324",
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "m"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "R-Jxps5VAtik"
   },
   "outputs": [],
   "source": [
    "m.span()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "240d3085"
   },
   "outputs": [],
   "source": [
    "m.group()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "ee1d8e0d"
   },
   "outputs": [],
   "source": [
    "len('Woodchuck'), 'Woodchuck ran ...'[8]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "e11d6c98"
   },
   "outputs": [],
   "source": [
    "m.span(), m.start()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "cfcefeef"
   },
   "outputs": [],
   "source": [
    "m = p.match('Three Woodchucks ran after a woodchuck.')\n",
    "print(m)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "0d84af4e"
   },
   "outputs": [],
   "source": [
    "m = p.search('Three Woodchucks ran after a woodchuck.')\n",
    "m.group(), m.span(), 'Three Woodchucks'.find('Woodchuck')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "690035a2"
   },
   "outputs": [],
   "source": [
    "matches = p.findall('Three Woodchucks ran after a woodchuck.')\n",
    "matches"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "677ac286"
   },
   "outputs": [],
   "source": [
    "matches = p.finditer('Three Woodchucks ran after a woodchuck.')\n",
    "for m in matches:\n",
    "    print(m.span())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "4469c308"
   },
   "outputs": [],
   "source": [
    "p = re.compile('[Ww]oodchuck|[Gg]roundhog')\n",
    "matches = p.findall('The woodchuck appears at the beginning in the movie Groundhog Day')\n",
    "matches"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "wfdt_ms1CsBX"
   },
   "outputs": [],
   "source": [
    "# Four attempts at pulling the numbers out of the same sentence.\n",
    "sentence = \"His GPA is 3.85. His age is 23, and he can swim 4000 yards without stopping\"\n",
    "\n",
    "# \\d+ matches runs of digits only, so 3.85 is split into '3' and '85'.\n",
    "num_re = re.compile(r'\\d+')\n",
    "matches = num_re.findall(sentence)\n",
    "print(matches)\n",
    "\n",
    "# For this ASCII text [0-9]+ finds exactly the same runs as \\d+.\n",
    "num_re = re.compile(r'[0-9]+')\n",
    "matches = num_re.findall(sentence)\n",
    "print(matches)\n",
    "\n",
    "# Allowing '.' in the class keeps 3.85 together, but also swallows the\n",
    "# period that ends the sentence ('3.85.').\n",
    "num_re = re.compile(r'[\\d.]+')\n",
    "matches = num_re.findall(sentence)\n",
    "print(matches)\n",
    "\n",
    "# Digits, at most one '.', then digits again: the sentence-ending period is\n",
    "# left out and '3.85.4' only contributes '3.85'. The pattern needs at least\n",
    "# two digits, so the lone trailing '4' is not reported.\n",
    "num_re = re.compile(r'[\\d]+ [.]? \\d+', re.VERBOSE)\n",
    "matches = num_re.findall(\"His GPA is 3.85. His age is 23, and he \"\n",
    "                         \"can swim 4000 yards without stopping. \"\n",
    "                         \"How about 3.85.4?\")\n",
    "print(matches)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "270bb3c5"
   },
   "outputs": [],
   "source": [
    "import re\n",
    "p = re.compile('[Ww]oodchuck | [Gg]roundhog')\n",
    "matches = p.findall('The woodchucks appears at the beginning in the movie Groundhog Day')\n",
    "matches"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "fc47ef4c"
   },
   "outputs": [],
   "source": [
    "p = re.compile('[Ww]oodchuck | [Gg]roundhog', re.VERBOSE)\n",
    "matches = p.findall('The woodchucks appears at the beginning in the movie Groundhog Day')\n",
    "matches"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "f55d7d85"
   },
   "outputs": [],
   "source": [
    "p = re.compile(r'[Ww]oodchuck\\ | [Gg]roundhog', re.VERBOSE)\n",
    "matches = p.findall('The woodchuck appears at the beginning in the movie Groundhog Day')\n",
    "matches"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "ecfa80b7"
   },
   "outputs": [],
   "source": [
    "p = re.compile('[Ww]oodchucks?|[Gg]roundhogs?')\n",
    "p.findall('Woodchucks, by any other name, such as groundhog, '\n",
    "          'would woodchuck the same.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "a7cd1bf2"
   },
   "outputs": [],
   "source": [
    "p = re.compile('^[Hh]ow')\n",
    "p.findall('How do you do? I do how I always do.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "0fadba18"
   },
   "outputs": [],
   "source": [
    "p = re.compile('[Hh]ow')\n",
    "p.findall('How do you do? I do how I always do.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "0f254770"
   },
   "outputs": [],
   "source": [
    "# A bare [tT]he has no boundary check, so it also finds the 'the' inside 'other'.\n",
    "p = re.compile('[tT]he')\n",
    "p.findall('The cat ran after the dog, but the other dog intervened.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "8965f721"
   },
   "outputs": [],
   "source": [
    "p = re.compile('[tT]he')\n",
    "sentence = ('The cat ran after the dog, '\n",
    "            'but the other dog intervened.')\n",
    "\n",
    "# First pass: show the Match objects themselves.\n",
    "matches = p.finditer(sentence)\n",
    "for m in matches:\n",
    "    print(m)\n",
    "\n",
    "print()\n",
    "\n",
    "# The iterator above is exhausted, so a new one is needed for the second\n",
    "# pass, which prints the matched text followed by its start and end index.\n",
    "matches = p.finditer(sentence)\n",
    "for m in matches:\n",
    "    print(m.group(), *m.span())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "8a9edca2"
   },
   "outputs": [],
   "source": [
    "# Requiring a non-letter on each side rejects the 'the' inside 'other', but the\n",
    "# neighbouring characters become part of each match and the sentence-initial\n",
    "# 'The' is missed because no character precedes it.\n",
    "p = re.compile('[^a-zA-Z][tT]he[^a-zA-Z]')\n",
    "p.findall('The cat ran after the dog, '\n",
    "          'but the other dog intervened.')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "793e5931",
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "s = 'The cat ran after the dog, but the other dog intervened.'\n",
    "\n",
    "p1 = re.compile('[^a-zA-Z] ([tT]he) [^a-zA-Z]', re.VERBOSE)\n",
    "r1 = p1.findall(s)\n",
    "print(r1)\n",
    "\n",
    "p2 = re.compile('^([tT]he) [^a-zA-Z]', re.VERBOSE)\n",
    "r2 = p2.findall(s)\n",
    "print(r2)\n",
    "\n",
    "# Instead of trying to combine the two patterns (but try it as a homework exercise).\n",
    "r3 = p1.findall(' ' + s)\n",
    "print(r3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "ada21245"
   },
   "outputs": [],
   "source": [
    "p = re.compile('a+b+')\n",
    "p.findall('aabb aaabbb abcba aba aaaabb')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "eabd9db4"
   },
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "p = re.compile(r'[pP]ythons?')\n",
    "matches = p.findall('Python is a fun programming language. '\n",
    "                    'There are many pythons in the jungle. '\n",
    "                    'I like PYTHON!')\n",
    "print(matches)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "2767e526",
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "p = re.compile(r'\\s(cats?|dogs?)\\W')\n",
    "matches = p.findall('It is raining cats and dogs. '\n",
    "                    'Her cat likes catfish.')\n",
    "print(matches)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "c71bdccf"
   },
   "outputs": [],
   "source": [
    "p = re.compile('colou?r')\n",
    "p.sub('<color>', 'I would like to drive a blue coloured car.')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "_7N16ILYgCZh"
   },
   "source": [
    "## Character classes `\\d`, `\\D`, ..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "lMA1l4EQJ1SW"
   },
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "text = 'I woke up at 8am this morning.'\n",
    "# Raw string: in a plain literal '\\D' is an invalid string escape, which\n",
    "# newer Python versions warn about. \\D+ matches runs of non-digit characters.\n",
    "p = re.compile(r'\\D+')\n",
    "p.findall(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "6841i3yMOdHd"
   },
   "outputs": [],
   "source": [
    "p = re.compile('[^0-9]+')\n",
    "p.findall(text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "etgFblPEO32k"
   },
   "source": [
    "Regular expression for recognizing time expressions, e.g. `8am`, `12:05pm`, ..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "Me0naJG-PC9L"
   },
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "p = re.compile('[0-9]+(:[0-9]+)?[ap]m')\n",
    "text = 'I woke up at 8am and had lunch at 12:35pm, then went for a walk.'\n",
    "m1 = p.search(text)\n",
    "print(m1)\n",
    "print(m1.group()) # this prints the matched string\n",
    "print(m1.start()) # this prints the starting position\n",
    "print(m1.end()) # this prints the end position\n",
    "print(m1.span()) # this prints the (start, end) tuple"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "9AgM2VJaQHY3"
   },
   "outputs": [],
   "source": [
    "m2 = p.search(text[m1.end():])\n",
    "print(m2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "bBpMM1hqS2zJ"
   },
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "p = re.compile('[0-9]+(:[0-9]+)?[ap]m')\n",
    "text = 'I woke up at 8am and had lunch at 12:35pm, then went for a walk.'\n",
    "\n",
    "# Find and print all matches.\n",
    "# Each search runs on what is left after the previous match. The shrinking\n",
    "# tail lives in its own variable so that `text` itself is not overwritten.\n",
    "remaining = text\n",
    "m = p.search(remaining)\n",
    "while m:\n",
    "  print(m.group())\n",
    "  remaining = remaining[m.end():]\n",
    "  m = p.search(remaining)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "d-OGJA6EgdNo"
   },
   "source": [
    "`Pattern.search()` accepts an optional `pos` argument that specifies the index at which the search starts (0 by default)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "r-ebDGI1TRYH"
   },
   "outputs": [],
   "source": [
    "text = 'I woke up at 8am and had lunch at 12:35pm, then went for a walk.'\n",
    "p.search(text, pos = 16)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "Yg7snMSYVDmC"
   },
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "p = re.compile('[0-9]+(:[0-9]+)?[ap]m')\n",
    "text = 'I woke up at 8am and had lunch at 12:35pm, then went for a walk.'\n",
    "# Find and print all matches.\n",
    "m = p.search(text)\n",
    "while m:\n",
    "  print(m.group())\n",
    "  m = p.search(text, pos = m.end())"
   ]
  },
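  {
   "cell_type": "markdown",
   "metadata": {
    "id": "we11oyHkgxmF"
   },
   "source": [
    "Use `re.VERBOSE` so that whitespace in the pattern is ignored (except inside a character class or when escaped with a backslash); this lets you space out the parts of a regular expression for readability."
   ]
  },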
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "0JjHmg-sVs3M"
   },
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "p = re.compile('[0-9]+ (:[0-9]+)? [ap]m', re.VERBOSE)\n",
    "text = 'I woke up at 8am and had lunch at 12:15pm, then went for a walk.'\n",
    "m = p.search(text)\n",
    "while m:\n",
    "  print(m.group())\n",
    "  m = p.search(text, pos = m.end())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "PtC_K3d_r5A-"
   },
   "source": [
    "Let's make the regular expression more precise."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "f3hQPLs-r-JI"
   },
   "outputs": [],
   "source": [
    "# Two negative lookbehinds guard the start of a time expression:\n",
    "#   (?<!\\d)   the hour must not be glued to a preceding digit\n",
    "#   (?<!\\d:)  the hour must not directly follow 'digit:' (e.g. the '09am' tail of '34:09am')\n",
    "# Unlike a positive lookbehind such as (?<=\\D), they also succeed at the very\n",
    "# start of the text, where there is no preceding character at all.\n",
    "p = re.compile(r'(?<!\\d) (?<!\\d:) (0?[0-9] | 1[012]) (:[0-5][0-9])? [ap]m', re.VERBOSE)\n",
    "text = 'I woke up at 8am and had lunch at 12:15pm, then went for a walk. 34:49am is not a valid time expression.'\n",
    "m = p.search(text)\n",
    "while m:\n",
    "  print(m.group())\n",
    "  m = p.search(text, pos = m.end())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "DT0vLvL8g70h"
   },
   "source": [
    "## Use parentheses for *capturing* behavior"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "keNQLZRvV3Je"
   },
   "outputs": [],
   "source": [
    "p = re.compile('[^a-zA-Z] [Tt]he [^a-zA-Z]', re.VERBOSE)\n",
    "m = p.findall('Yes. The cat chases the dogs that bathe.')\n",
    "print(m)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "eejWIe4eXfLP"
   },
   "outputs": [],
   "source": [
    "p = re.compile('[^a-zA-Z] ([Tt]he) [^a-zA-Z]', re.VERBOSE)\n",
    "m = p.findall('Yes. The cat chases the dogs that bathe.')\n",
    "print(m)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "6af46db1"
   },
   "outputs": [],
   "source": [
    "p = re.compile('( [0-9]+ )', re.VERBOSE)\n",
    "p.sub(r'<\\1> extra', 'the 35 boxes')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "8bd8991d"
   },
   "outputs": [],
   "source": [
    "p = re.compile('( [0-9]+ )', re.VERBOSE)\n",
    "p.sub(r'<\\1> extra', '10 whiseky bottles and 35 boxes of gold')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "qLENJF3rlsTq"
   },
   "source": [
    "## Use `(?!   )` (negative lookahead) to match only where the enclosed pattern does *not* follow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "f04b857e"
   },
   "outputs": [],
   "source": [
    "p = re.compile(r'Isaac (?!Asimov)')\n",
    "matches = p.finditer('I like reading Isaac Asimov '\n",
    "                     'and listening to Isaac Perlman '\n",
    "                     'and playing chess with Isaac .')\n",
    "for m in matches:\n",
    "    print(m.span(), m.group())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "53cef78d"
   },
   "outputs": [],
   "source": [
    "p = re.compile(r'Isaac (?!Asimov|Perlman)')\n",
    "matches = p.finditer('I like reading Isaac Asimov '\n",
    "                     'and listening to Isaac Perlman '\n",
    "                     'and playing chess with Isaac .')\n",
    "for m in matches:\n",
    "    print(m.span(), m.group())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "AI4lrVA_hHFD"
   },
   "source": [
    "## Use `(?:    )` to indicate that parentheses are used for *grouping* only, not for capturing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "uh1bI_uqXtFO"
   },
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "p = re.compile('[0-9]+ (?: :[0-9]+)? [ap]m', re.VERBOSE)\n",
    "text = 'I woke up at 8am and had lunch at 12:35pm, then went for a walk.'\n",
    "m = p.findall(text)\n",
    "print(m)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "FQPkDKdYhuL4"
   },
   "source": [
    "## Find-replace using regular expressions and `p.sub()`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "P5pVD3umYCGn"
   },
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "# Raw string so the backslash in \\d reaches the regex engine untouched\n",
    "# (in a plain literal '\\d' is an invalid string escape).\n",
    "p = re.compile(r'\\d+')\n",
    "text = 'She ran for 3 miles, than she ate 2 apples and drank a 12 ounce can of Coke.'\n",
    "# Every run of digits is replaced by the placeholder <num>.\n",
    "p.sub('<num>', text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "krisMFnSh2mb"
   },
   "source": [
    "Capture groups with parentheses and refer to them in the replacement string by number (`\\1`, `\\2`, ...)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "78v1GjhOZR5G"
   },
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "# Raw strings for both the pattern and the replacement, so that \\d and \\1 are\n",
    "# interpreted by the regex engine rather than by the string literal.\n",
    "p = re.compile(r'(\\d+)')\n",
    "text = 'I ran for 3 miles, than I ate 2 apples and drank a 12 ounce can of Coke.'\n",
    "# \\1 re-inserts the captured number, so each number is followed by ' extra'.\n",
    "p.sub(r'\\1 extra', text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "2rlziVGFZvGX"
   },
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "p = re.compile(\".*I am (depressed|sad).*\")\n",
    "text = \"My cat is sick, I am sad, I don't know what to do!\"\n",
    "p.sub(r'I am sorry to hear you are \\1.', text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "J44__MgVcc2k"
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}