English
hassaanulhaq01 commited on
Commit
532394f
·
verified ·
1 Parent(s): 707bf00

Add interactive schedule_o notebook from Databricks

Browse files
notebooks/schedule_o_classifier_notebook.ipynb ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "application/vnd.databricks.v1+cell": {
7
+ "cellMetadata": {},
8
+ "inputWidgets": {},
9
+ "nuid": "b7236467-00b7-43e0-bba8-6ed909c52fb5",
10
+ "showTitle": false,
11
+ "tableResultSettingsMap": {},
12
+ "title": ""
13
+ }
14
+ },
15
+ "source": [
16
+ "##### Classifies a Form 990 Schedule O text description based on rules for both Form 990 and Form 990-EZ using the open text response from the filer describing which part of the form they are providing supplementary information for. \n",
17
+ "##### It outputs a string classification label from the following list: 'I EZ', 'II EZ', 'III EZ', 'V EZ', 'III', 'V', 'VI', 'VII', 'IX', 'XI', 'XII', 'Unknown'. \n",
18
+ "\n",
19
+ "##### Note: Change the source and target tables according to the environment\n",
20
+ "\n",
21
+ "##### Author: GivingTuesday Data Commons"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": 0,
27
+ "metadata": {
28
+ "application/vnd.databricks.v1+cell": {
29
+ "cellMetadata": {
30
+ "byteLimit": 2048000,
31
+ "rowLimit": 10000
32
+ },
33
+ "inputWidgets": {},
34
+ "nuid": "833ba8f0-d981-446d-9a8d-5b8ee3cad525",
35
+ "showTitle": false,
36
+ "tableResultSettingsMap": {},
37
+ "title": ""
38
+ }
39
+ },
40
+ "outputs": [],
41
+ "source": [
42
+ "import re\n",
43
+ "import pandas as pd\n",
44
+ "import string\n",
45
+ "from pyspark.sql import SparkSession\n",
46
+ "from pyspark.sql.functions import udf, col, when\n",
47
+ "from pyspark.sql.types import StringType\n",
48
+ "\n",
49
+ "def classify_schedule_o_submission(sidfalrdesc: str) -> str:\n",
50
+ " \"\"\"\n",
51
+ " Classifies a Form 990 Schedule O text description based on rules for both\n",
52
+ " Form 990 and Form 990-EZ.\n",
53
+ "\n",
54
+ " Args:\n",
55
+ " sidfalrdesc: The open text response from the filer describing which\n",
56
+ " part of the form they are providing supplementary\n",
57
+ " information for.\n",
58
+ "\n",
59
+ " Returns:\n",
60
+ " A string classification label from the following list:\n",
61
+ " 'I EZ', 'II EZ', 'III EZ', 'V EZ', 'III', 'V', 'VI', 'VII', 'IX',\n",
62
+ " 'XI', 'XII', 'Unknown'.\n",
63
+ " \"\"\"\n",
64
+ " if sidfalrdesc is None or sidfalrdesc == '':\n",
65
+ " return 'Unknown'\n",
66
+ " \n",
67
+ " text = sidfalrdesc.lower()\n",
68
+ "\n",
69
+ " text = text.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))\n",
70
+ " \n",
71
+ " # Replace multiple spaces with a single space\n",
72
+ " text = re.sub(r'\\s+', ' ', text).strip()\n",
73
+ " # Define a common pattern for 'part' or 'pt' for cleaner regexes\n",
74
+ " part_or_pt_pattern = r'(part|pt)\\s+'\n",
75
+ "\n",
76
+ " rules = [\n",
77
+ " # ======================================================================\n",
78
+ " # Rule Group 1: Direct and Unambiguous EZ Form Mentions\n",
79
+ " # ======================================================================\n",
80
+ " ('I EZ', r'\\b(form\\s+990-ez.*' + part_or_pt_pattern + r'(i|1)\\b|' + part_or_pt_pattern + r'(i|1)\\s+ez\\b|\\b(990\\s?)?ez.*' + part_or_pt_pattern + r'(i|1)\\b)'),\n",
81
+ " ('II EZ', r'\\b(form\\s+990-ez.*' + part_or_pt_pattern + r'(ii|2)\\b|' + part_or_pt_pattern + r'(ii|2)\\s+ez\\b|\\b(990\\s?)?ez.*' + part_or_pt_pattern + r'(ii|2)\\b)'),\n",
82
+ " ('III EZ', r'\\b(form\\s+990-ez.*' + part_or_pt_pattern + r'(iii|3)\\b|' + part_or_pt_pattern + r'(iii|3)\\s+ez\\b|\\b(990\\s?)?ez.*' + part_or_pt_pattern + r'(iii|3)\\b)'),\n",
83
+ " ('V EZ', r'\\b(form\\s+990-ez.*' + part_or_pt_pattern + r'(v|5)\\b|' + part_or_pt_pattern + r'(v|5)\\s+ez\\b|\\b(990\\s?)?ez.*' + part_or_pt_pattern + r'(v|5)\\b)'),\n",
84
+ "\n",
85
+ " # ======================================================================\n",
86
+ " # Rule Group 2: Specific Part/Pt + Line Number Combinations (990 & 990-EZ)\n",
87
+ " # These rules identify parts by their unique line numbers, regardless of surrounding context.\n",
88
+ " # ======================================================================\n",
89
+ " ('I EZ', r'\\b' + part_or_pt_pattern + r'(i|1)\\b.*(line|ln)\\s+(8|10|16|20)\\b'), # EZ Part I specific lines\n",
90
+ " ('II EZ', r'\\b' + part_or_pt_pattern + r'(ii|2)\\b.*(line|ln)\\s+(24|26)\\b'), # EZ Part II specific lines\n",
91
+ " ('III EZ', r'\\b' + part_or_pt_pattern + r'(iii|3)\\b.*(line|ln)\\s+(31)\\b'), # EZ Part III specific line\n",
92
+ " ('V EZ', r'\\b' + part_or_pt_pattern + r'(v|5)\\b.*(line|ln)\\s+(33|34|35b|44d)\\b'), # EZ Part V specific lines\n",
93
+ "\n",
94
+ " ('III', r'\\b' + part_or_pt_pattern + r'(iii|3)\\b.*(line|ln)\\s+(2|3|4)\\b'), # Standard 990 Part III specific lines\n",
95
+ " ('V', r'\\b' + part_or_pt_pattern + r'(v|5)\\b.*(line|ln)\\s+(3b|13a|14b)\\b'), # Standard 990 Part V specific lines\n",
96
+ "\n",
97
+ " ('VI', r'\\b' + part_or_pt_pattern + r'(vi|6)\\b.*(line|ln)\\s+(11b|19)\\b'),\n",
98
+ " ('IX', r'\\b' + part_or_pt_pattern + r'(ix|9)\\b.*(line|ln)\\s+(11g|24e)\\b'),\n",
99
+ " ('XII', r'\\b' + part_or_pt_pattern + r'(xii|12)\\b.*(line|ln)\\s+(1|2c)\\b'),\n",
100
+ "\n",
101
+ " # ======================================================================\n",
102
+ " # Rule Group 3: Highly Specific Contextual Keywords (No mandatory line number or part mention)\n",
103
+ " # ======================================================================\n",
104
+ " ('VI', r'governance|governing\\s+body|board\\s+of\\s+directors|conflict\\s+of\\s+interest|whistleblower\\s+policy|document\\s+retention'\n",
105
+ " r'|independent\\s+voting\\s+members|minutes|states\\s+where\\s+copy\\s+of\\s+return\\s+is\\s+filed|public\\s+disclosure'\n",
106
+ " r'|process\\s+for\\s+review|process\\s+for\\s+determining\\s+compensation'),\n",
107
+ " ('VII', r'compensation.*related\\s+organization|form\\s+w-2|form\\s+1099-misc|form\\s+1099-nec|severance\\s+payment|highest\\s+compensated'),\n",
108
+ " ('IX', r'functional\\s+expenses|statement\\s+of\\s+functional\\s+expenses|lobbying|professional\\s+fundraising|investment\\s+management\\s+fees'\n",
109
+ " r'|other\\s+fees\\s+for\\s+services|all\\s+other\\s+expenses'),\n",
110
+ " ('XI', r'reconciliation\\s+of\\s+net\\s+assets|prior\\s+period\\s+adjustment|unrealized\\s+gains|donated\\s+services'),\n",
111
+ " ('XII', r'financial\\s+statements\\s+and\\s+reporting|audited\\s+financial\\s+statements|reviewed\\s+financial\\s+statements|compiled\\s+financial\\s+statements'\n",
112
+ " r'|fin\\s+48|asc\\s+740|basis\\s+of\\s+accounting|accounting\\s+method|committee\\s+oversight'),\n",
113
+ " ('V', r'other\\s+irs\\s+filings|tax\\s+compliance|fbar|foreign\\s+bank\\s+account|form\\s+4720|tax-exempt\\s+bonds'),\n",
114
+ " ('III', r'exempt\\s+purpose\\s+achievements'),\n",
115
+ " ('III EZ', r'program\\s+service\\s+accomplishments|program\\s+service|additional\\s+program\\s+services'),\n",
116
+ " ('V EZ', r'significant\\s+activity\\s+changes|amended\\s+bylaws|changes\\s+to\\s+organizing\\s+documents|unrelated\\s+business\\s+gross\\s+income|list\\s+of\\s+states\\s+return\\s+filed\\s+in'),\n",
117
+ " ('II EZ', r'\\bother\\s+assets\\b|\\btotal\\s+assets\\b|\\bloans\\s+from\\s+officers\\b|\\baccounts\\s+payable\\b'),\n",
118
+ "\n",
119
+ " # ======================================================================\n",
120
+ " # Rule Group 4: Ambiguous Part/Pt Mentions (Default to Standard 990)\n",
121
+ " # ======================================================================\n",
122
+ " ('XII', r'\\b' + part_or_pt_pattern + r'(xii|12)\\b'),\n",
123
+ " ('XI', r'\\b' + part_or_pt_pattern + r'(xi|11)\\b'),\n",
124
+ " ('IX', r'\\b' + part_or_pt_pattern + r'(ix|9)\\b'),\n",
125
+ " ('VII', r'\\b' + part_or_pt_pattern + r'(vii|7)\\b'),\n",
126
+ " ('VI', r'\\b' + part_or_pt_pattern + r'(vi|6)\\b'),\n",
127
+ " ('V', r'\\b' + part_or_pt_pattern + r'(v|5)\\b'), \n",
128
+ " ('III', r'\\b' + part_or_pt_pattern + r'(iii|3)\\b'),\n",
129
+ "\n",
130
+ " # ======================================================================\n",
131
+ " # Rule Group 5: Ambiguous Keyword Mentions (Default to Standard 990)\n",
132
+ " # ======================================================================\n",
133
+ " ('IX', r'\\bother\\s+expenses\\b'), # Catch \"other expenses\" as a low-priority IX match\n",
134
+ " ('XI', r'other\\s+changes\\s+in\\s+net\\s+assets'),\n",
135
+ " ]\n",
136
+ "\n",
137
+ " for classification, pattern in rules:\n",
138
+ " if re.search(pattern, text):\n",
139
+ " return classification\n",
140
+ "\n",
141
+ " return 'Unknown'\n",
142
+ "\n",
143
+ "classify_udf = udf(classify_schedule_o_submission, StringType())\n",
144
+ "\n",
145
+ "\n",
146
+ "def process_with_filters(source_table: str = \"---Put the source table name here schedule O Table Name here---\", \n",
147
+ " target_table: str = \"put target table path here,\n",
148
+ " tax_years: list = [2020,2021,2022,2023,2024,2025],\n",
149
+ " limit_records: int = None):\n",
150
+ " \"\"\"\n",
151
+ " Process Schedule O classifications with optional filters.\n",
152
+ " \n",
153
+ " Args:\n",
154
+ " source_table: Name of the source table\n",
155
+ " target_table: Target table to write results\n",
156
+ " tax_years: List of tax years to process (optional)\n",
157
+ " limit_records: Limit number of records for testing (optional)\n",
158
+ " \"\"\"\n",
159
+ " \n",
160
+ " df = spark.table(source_table)\n",
161
+ " \n",
162
+ " if tax_years:\n",
163
+ " df = df.filter(col(\"TAXYEAR\").isin(tax_years))\n",
164
+ " \n",
165
+ " if limit_records:\n",
166
+ " df = df.limit(limit_records)\n",
167
+ " \n",
168
+ " df_classified = df.withColumn(\n",
169
+ " \"SCHEDULE_O_CLASSIFICATION\",\n",
170
+ " classify_udf(col(\"SIDFALRDESC\"))\n",
171
+ " )\n",
172
+ "\n",
173
+ " df_classified.write \\\n",
174
+ " .mode(\"overwrite\") \\\n",
175
+ " .option(\"overwriteSchema\", \"true\") \\\n",
176
+ " .saveAsTable(target_table)\n",
177
+ " \n",
178
+ " df_classified.groupBy(\"SCHEDULE_O_CLASSIFICATION\").count().orderBy(\"count\", ascending=False).show()\n"
179
+ ]
180
+ },
181
+ {
182
+ "cell_type": "code",
183
+ "execution_count": 0,
184
+ "metadata": {
185
+ "application/vnd.databricks.v1+cell": {
186
+ "cellMetadata": {
187
+ "byteLimit": 2048000,
188
+ "rowLimit": 10000
189
+ },
190
+ "inputWidgets": {},
191
+ "nuid": "c3bf2484-e248-4f39-a1b9-f76e6e4cf67e",
192
+ "showTitle": false,
193
+ "tableResultSettingsMap": {},
194
+ "title": ""
195
+ }
196
+ },
197
+ "outputs": [
198
+ {
199
+ "output_type": "stream",
200
+ "name": "stdout",
202
+ "text": [
203
+ "+-------------------------+-------+\n|SCHEDULE_O_CLASSIFICATION| count|\n+-------------------------+-------+\n| VI|4696909|\n| I EZ|1723572|\n| IX|1590432|\n| II EZ| 471703|\n| Unknown| 450339|\n| XI| 234674|\n| III| 214748|\n| III EZ| 184880|\n| XII| 160507|\n| VII| 45941|\n| V| 34822|\n| V EZ| 13777|\n+-------------------------+-------+\n\n"
204
+ ]
205
+ }
206
+ ],
207
+ "source": [
208
+ "process_with_filters()"
209
+ ]
210
+ }
211
+ ],
212
+ "metadata": {
213
+ "application/vnd.databricks.v1+notebook": {
214
+ "computePreferences": null,
215
+ "dashboards": [],
216
+ "environmentMetadata": {
217
+ "base_environment": "",
218
+ "environment_version": "4"
219
+ },
220
+ "inputWidgetPreferences": null,
221
+ "language": "python",
222
+ "notebookMetadata": {
223
+ "pythonIndentUnit": 4
224
+ },
225
+ "notebookName": "schedule_o_classifier_notebook",
226
+ "widgets": {}
227
+ },
228
+ "language_info": {
229
+ "name": "python"
230
+ }
231
+ },
232
+ "nbformat": 4,
233
+ "nbformat_minor": 0
234
+ }