{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "lVvPGn1Qfkxu" }, "source": [ "# More Regular Expression Examples" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "8fax2difIRo4" }, "outputs": [], "source": [ "import re\n", "\n", "p = re.compile('[Pp]umas?|[Cc]ougars?')\n", "p.findall('I saw a puma chasing two cougars.')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "bY4Z9_qTFGKi" }, "outputs": [], "source": [ "text = 'I saw a puma puma puma puma in the jungle.'\n", "p = re.compile('(puma )+')\n", "m = p.search(text)\n", "print(m)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "5ac82669" }, "outputs": [], "source": [ "p = re.compile('[Ww]oodchuck')\n", "m = p.match('Woodchucks ran after a woodchuck.')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "3bc1f324", "scrolled": true }, "outputs": [], "source": [ "m" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "R-Jxps5VAtik" }, "outputs": [], "source": [ "m.span()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "240d3085" }, "outputs": [], "source": [ "m.group()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ee1d8e0d" }, "outputs": [], "source": [ "len('Woodchuck'), 'Woodchuck ran ...'[8]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "e11d6c98" }, "outputs": [], "source": [ "m.span(), m.start()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "cfcefeef" }, "outputs": [], "source": [ "m = p.match('Three Woodchucks ran after a woodchuck.')\n", "print(m)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "0d84af4e" }, "outputs": [], "source": [ "m = p.search('Three Woodchucks ran after a woodchuck.')\n", "m.group(), m.span(), 'Three Woodchucks'.find('Woodchuck')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "690035a2" }, 
"outputs": [], "source": [ "matches = p.findall('Three Woodchucks ran after a woodchuck.')\n", "matches" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "677ac286" }, "outputs": [], "source": [ "matches = p.finditer('Three Woodchucks ran after a woodchuck.')\n", "for m in matches:\n", " print(m.span())" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "4469c308" }, "outputs": [], "source": [ "p = re.compile('[Ww]oodchuck|[Gg]roundhog')\n", "matches = p.findall('The woodchuck appears at the beginning in the movie Groundhog Day')\n", "matches" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "wfdt_ms1CsBX" }, "outputs": [], "source": [ "# Compare four patterns for extracting numbers from a sentence.\n", "# The pattern is named p_digits rather than pd, the conventional pandas alias.\n", "p_digits = re.compile(r'\\d+')\n", "matches = p_digits.findall(\"His GPA is 3.85. His age is 23, and he can swim 4000 yards without stopping\")\n", "print(matches)\n", "\n", "p_digits = re.compile(r'[0-9]+')\n", "matches = p_digits.findall(\"His GPA is 3.85. His age is 23, and he can swim 4000 yards without stopping\")\n", "print(matches)\n", "\n", "p_digits = re.compile(r'[\\d.]+')\n", "matches = p_digits.findall(\"His GPA is 3.85. His age is 23, and he can swim 4000 yards without stopping\")\n", "print(matches)\n", "\n", 
"p_digits = re.compile(r'[\\d]+ [.]? \\d+', re.VERBOSE)\n", "matches = p_digits.findall(\"His GPA is 3.85. His age is 23, and he \"\n", " \"can swim 4000 yards without stopping. \"\n", " \"How about 3.85.4?\")\n", "print(matches)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "270bb3c5" }, "outputs": [], "source": [ "import re\n", "p = re.compile('[Ww]oodchuck | [Gg]roundhog')\n", "matches = p.findall('The woodchucks appears at the beginning in the movie Groundhog Day')\n", "matches" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "fc47ef4c" }, "outputs": [], "source": [ "p = re.compile('[Ww]oodchuck | [Gg]roundhog', re.VERBOSE)\n", "matches = p.findall('The woodchucks appears at the beginning in the movie Groundhog Day')\n", "matches" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "f55d7d85" }, "outputs": [], "source": [ "p = re.compile(r'[Ww]oodchuck\\ | [Gg]roundhog', re.VERBOSE)\n", "matches = p.findall('The woodchuck appears at the beginning in the movie Groundhog Day')\n", "matches" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ecfa80b7" }, "outputs": [], "source": [ "p = re.compile('[Ww]oodchucks?|[Gg]roundhogs?')\n", "p.findall('Woodchucks, by any other name, such as groundhog, '\n", " 'would woodchuck the same.')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "a7cd1bf2" }, "outputs": [], "source": [ "p = re.compile('^[Hh]ow')\n", "p.findall('How do you do? I do how I always do.')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "0fadba18" }, "outputs": [], "source": [ "p = re.compile('[Hh]ow')\n", "p.findall('How do you do? 
I do how I always do.')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "0f254770" }, "outputs": [], "source": [ "#p = re.compile('[^a-zA-Z][tT]he[^a-zA-Z]')\n", "p = re.compile('[tT]he')\n", "p.findall('The cat ran after the dog, but the other dog intervened.')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "8965f721" }, "outputs": [], "source": [ "p = re.compile('[tT]he')\n", "matches = p.finditer('The cat ran after the dog, '\n", " 'but the other dog intervened.')\n", "for m in matches:\n", " print(m)\n", "\n", "print()\n", "\n", "matches = p.finditer('The cat ran after the dog, '\n", " 'but the other dog intervened.')\n", "for m in matches:\n", " print(m.group(), m.start(), m.end())" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "8a9edca2" }, "outputs": [], "source": [ "p = re.compile('[^a-zA-Z][tT]he[^a-zA-Z]')\n", "#p = re.compile('[tT]he')\n", "p.findall('The cat ran after the dog, '\n", " 'but the other dog intervened.')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "793e5931", "scrolled": true }, "outputs": [], "source": [ "s = 'The cat ran after the dog, but the other dog intervened.'\n", "\n", "p1 = re.compile('[^a-zA-Z] ([tT]he) [^a-zA-Z]', re.VERBOSE)\n", "r1 = p1.findall(s)\n", "print(r1)\n", "\n", "p2 = re.compile('^([tT]he) [^a-zA-Z]', re.VERBOSE)\n", "r2 = p2.findall(s)\n", "print(r2)\n", "\n", "# Rather than combining the two patterns into one (try that as a homework exercise),\n", "# prepend a space so that p1 can also match 'The' at the start of the string.\n", "r3 = p1.findall(' ' + s)\n", "print(r3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ada21245" }, "outputs": [], "source": [ "p = re.compile('a+b+')\n", "p.findall('aabb aaabbb abcba aba aaaabb')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "eabd9db4" }, "outputs": [], "source": [ "import re\n", "\n", "p = re.compile(r'[pP]ythons?')\n", "matches = p.findall('Python is a fun programming language. 
'\n", " 'There are many pythons in the jungle. '\n", " 'I like PYTHON!')\n", "print(matches)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "2767e526", "scrolled": true }, "outputs": [], "source": [ "p = re.compile(r'\\s(cats?|dogs?)\\W')\n", "matches = p.findall('It is raining cats and dogs. '\n", " 'Her cat likes catfish.')\n", "print(matches)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "c71bdccf" }, "outputs": [], "source": [ "p = re.compile('colou?r')\n", "p.sub('', 'I would like to drive a blue coloured car.')" ] }, { "cell_type": "markdown", "metadata": { "id": "_7N16ILYgCZh" }, "source": [ "## Character classes `\\d`, `\\D`, ..." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "lMA1l4EQJ1SW" }, "outputs": [], "source": [ "import re\n", "\n", "text = 'I woke up at 8am this morning.'\n", "# Raw string: \\D is not a valid Python string escape, so a plain literal triggers a warning.\n", "p = re.compile(r'\\D+')\n", "p.findall(text)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "6841i3yMOdHd" }, "outputs": [], "source": [ "p = re.compile('[^0-9]+')\n", "p.findall(text)" ] }, { "cell_type": "markdown", "metadata": { "id": "etgFblPEO32k" }, "source": [ "Regular expression for recognizing time expressions, e.g. `8am`, `12:05pm`, ..." 
] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Me0naJG-PC9L" }, "outputs": [], "source": [ "import re\n", "\n", "p = re.compile('[0-9]+(:[0-9]+)?[ap]m')\n", "text = 'I woke up at 8am and had lunch at 12:35pm, then went for a walk.'\n", "m1 = p.search(text)\n", "print(m1)\n", "print(m1.group()) # this prints the matched string\n", "print(m1.start()) # this prints the starting position\n", "print(m1.end()) # this prints the end position\n", "print(m1.span()) # this prints the (start, end) tuple" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "9AgM2VJaQHY3" }, "outputs": [], "source": [ "# Search only the text after the first match; the span of m2 is relative to that slice.\n", "m2 = p.search(text[m1.end():])\n", "print(m2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "bBpMM1hqS2zJ" }, "outputs": [], "source": [ "import re\n", "\n", "p = re.compile('[0-9]+(:[0-9]+)?[ap]m')\n", "text = 'I woke up at 8am and had lunch at 12:35pm, then went for a walk.'\n", "\n", "# Find and print all matches by repeatedly searching the unscanned remainder.\n", "# A separate variable is sliced so that `text` itself is left intact.\n", "remaining = text\n", "m = p.search(remaining)\n", "while m:\n", "    print(m.group())\n", "    remaining = remaining[m.end():]\n", "    m = p.search(remaining)" ] }, { "cell_type": "markdown", "metadata": { "id": "d-OGJA6EgdNo" }, "source": [ "`Pattern.search()` has a keyword argument `pos` to specify where to start the search, by default 0." 
] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "r-ebDGI1TRYH" }, "outputs": [], "source": [ "text = 'I woke up at 8am and had lunch at 12:35pm, then went for a walk.'\n", "p.search(text, pos = 16)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Yg7snMSYVDmC" }, "outputs": [], "source": [ "import re\n", "\n", "p = re.compile('[0-9]+(:[0-9]+)?[ap]m')\n", "text = 'I woke up at 8am and had lunch at 12:35pm, then went for a walk.'\n", "# Find and print all matches.\n", "m = p.search(text)\n", "while m:\n", " print(m.group())\n", " m = p.search(text, pos = m.end())" ] }, { "cell_type": "markdown", "metadata": { "id": "we11oyHkgxmF" }, "source": [ "Use `re.VERBOSE` to indicate that spaces in the regular expression string are to be ignored." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "0JjHmg-sVs3M" }, "outputs": [], "source": [ "import re\n", "\n", "p = re.compile('[0-9]+ (:[0-9]+)? [ap]m', re.VERBOSE)\n", "text = 'I woke up at 8am and had lunch at 12:15pm, then went for a walk.'\n", "m = p.search(text)\n", "while m:\n", " print(m.group())\n", " m = p.search(text, pos = m.end())" ] }, { "cell_type": "markdown", "metadata": { "id": "PtC_K3d_r5A-" }, "source": [ "Let's make the regular expression more precise." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "f3hQPLs-r-JI" }, "outputs": [], "source": [ "# (?<!\\d) means 'not preceded by a digit'. Unlike (?<=\\D), it does not require a\n", "# preceding character, so a time at the very start of the text is matched too.\n", "p = re.compile(r'(?<!\\d) (0?[0-9] | 1[012]) (:[0-5][0-9])? [ap]m', re.VERBOSE)\n", "text = 'I woke up at 8am and had lunch at 12:15pm, then went for a walk. 
34:49am is not a valid time expression.'\n", "m = p.search(text)\n", "while m:\n", " print(m.group())\n", " m = p.search(text, pos = m.end())" ] }, { "cell_type": "markdown", "metadata": { "id": "DT0vLvL8g70h" }, "source": [ "## Use parentheses for *capturing* behavior" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "keNQLZRvV3Je" }, "outputs": [], "source": [ "p = re.compile('[^a-zA-Z] [Tt]he [^a-zA-Z]', re.VERBOSE)\n", "m = p.findall('Yes. The cat chases the dogs that bathe.')\n", "print(m)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "eejWIe4eXfLP" }, "outputs": [], "source": [ "p = re.compile('[^a-zA-Z] ([Tt]he) [^a-zA-Z]', re.VERBOSE)\n", "m = p.findall('Yes. The cat chases the dogs that bathe.')\n", "print(m)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "6af46db1" }, "outputs": [], "source": [ "p = re.compile('( [0-9]+ )', re.VERBOSE)\n", "p.sub(r'<\\1> extra', 'the 35 boxes')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "8bd8991d" }, "outputs": [], "source": [ "p = re.compile('( [0-9]+ )', re.VERBOSE)\n", "p.sub(r'<\\1> extra', '10 whiseky bottles and 35 boxes of gold')" ] }, { "cell_type": "markdown", "metadata": { "id": "qLENJF3rlsTq" }, "source": [ "## Use `(?! )` (negative lookahead) to indicate non-matching behavior." 
] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "f04b857e" }, "outputs": [], "source": [ "p = re.compile(r'Isaac (?!Asimov)')\n", "matches = p.finditer('I like reading Isaac Asimov '\n", " 'and listening to Isaac Perlman '\n", " 'and playing chess with Isaac .')\n", "for m in matches:\n", " print(m.span(), m.group())" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "53cef78d" }, "outputs": [], "source": [ "p = re.compile(r'Isaac (?!Asimov|Perlman)')\n", "matches = p.finditer('I like reading Isaac Asimov '\n", " 'and listening to Isaac Perlman '\n", " 'and playing chess with Isaac .')\n", "for m in matches:\n", " print(m.span(), m.group())" ] }, { "cell_type": "markdown", "metadata": { "id": "AI4lrVA_hHFD" }, "source": [ "## Use `(?: )` to indicate parentheses are used for *grouping*, but not capturing behavior" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "uh1bI_uqXtFO" }, "outputs": [], "source": [ "import re\n", "\n", "p = re.compile('[0-9]+ (?: :[0-9]+)? [ap]m', re.VERBOSE)\n", "text = 'I woke up at 8am and had lunch at 12:35pm, then went for a walk.'\n", "m = p.findall(text)\n", "print(m)" ] }, { "cell_type": "markdown", "metadata": { "id": "FQPkDKdYhuL4" }, "source": [ "## Find-replace using regular expressions and `p.sub()`" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "P5pVD3umYCGn" }, "outputs": [], "source": [ "import re\n", "\n", "# Raw string: \\d is not a valid Python string escape, so a plain literal triggers a warning.\n", "p = re.compile(r'\\d+')\n", "text = 'She ran for 3 miles, than she ate 2 apples and drank a 12 ounce can of Coke.'\n", "p.sub('', text)" ] }, { "cell_type": "markdown", "metadata": { "id": "krisMFnSh2mb" }, "source": [ "Capture groups using parentheses and numbered registers." 
] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "78v1GjhOZR5G" }, "outputs": [], "source": [ "import re\n", "\n", "# Raw string: \\d is not a valid Python string escape, so a plain literal triggers a warning.\n", "p = re.compile(r'(\\d+)')\n", "text = 'I ran for 3 miles, than I ate 2 apples and drank a 12 ounce can of Coke.'\n", "p.sub(r'\\1 extra', text)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "2rlziVGFZvGX" }, "outputs": [], "source": [ "import re\n", "\n", "p = re.compile(\".*I am (depressed|sad).*\")\n", "text = \"My cat is sick, I am sad, I don't know what to do!\"\n", "p.sub(r'I am sorry to hear you are \\1.', text)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "J44__MgVcc2k" }, "outputs": [], "source": [] } ], "metadata": { "colab": { "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.14" } }, "nbformat": 4, "nbformat_minor": 4 }