Commit 3757cf7

handle null values in indexing data
1 parent c770b7a commit 3757cf7

File tree

1 file changed: +15 -41 lines changed


api/utils/data_indexing.py

Lines changed: 15 additions & 41 deletions
@@ -30,9 +30,7 @@ def get_sql_type(pandas_dtype: str) -> str:
     return "TEXT"


-def create_table_for_resource(
-    resource: Resource, df: pd.DataFrame
-) -> Optional[ResourceDataTable]:
+def create_table_for_resource(resource: Resource, df: pd.DataFrame) -> Optional[ResourceDataTable]:
     """Create a database table for the resource data and index it."""
     try:
         # Create ResourceDataTable entry first to get the table name
@@ -65,9 +63,7 @@ def create_table_for_resource(
             df.to_csv(csv_data, index=False, header=False)
             csv_data.seek(0)

-            copy_sql = (
-                f'COPY "{temp_table}" ({",".join(quoted_columns)}) FROM STDIN WITH CSV'
-            )
+            copy_sql = f'COPY "{temp_table}" ({",".join(quoted_columns)}) FROM STDIN WITH CSV'
             cursor.copy_expert(copy_sql, csv_data)

             # Insert from temp to main table with validation
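
For reference, the bulk load touched by this hunk relies on psycopg2's copy_expert with a COPY ... FROM STDIN WITH CSV statement. A minimal standalone sketch of that pattern follows; the connection details, the table name, and the bulk_load_dataframe helper are hypothetical illustrations, not part of this codebase:

import io

import pandas as pd
import psycopg2


def bulk_load_dataframe(conn, table_name: str, df: pd.DataFrame) -> None:
    # Quote column names so reserved words and spaces survive the COPY
    quoted_columns = [f'"{col}"' for col in df.columns]
    csv_data = io.StringIO()
    df.to_csv(csv_data, index=False, header=False)
    csv_data.seek(0)
    copy_sql = f'COPY "{table_name}" ({",".join(quoted_columns)}) FROM STDIN WITH CSV'
    with conn.cursor() as cursor:
        # copy_expert streams the in-memory CSV straight into the table
        cursor.copy_expert(copy_sql, csv_data)
    conn.commit()


# Hypothetical usage:
# conn = psycopg2.connect("dbname=example")
# bulk_load_dataframe(conn, "temp_resource_data", pd.DataFrame({"a": [1, 2]}))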
@@ -102,14 +98,10 @@ def index_resource_data(resource: Resource) -> Optional[ResourceDataTable]:
     try:
         file_details = resource.resourcefiledetails
         if not file_details:
-            logger.info(
-                f"Resource {resource_id} has no file details, skipping indexing"
-            )
+            logger.info(f"Resource {resource_id} has no file details, skipping indexing")
             return None
     except Exception as e:
-        logger.error(
-            f"Failed to access file details for resource {resource_id}: {str(e)}"
-        )
+        logger.error(f"Failed to access file details for resource {resource_id}: {str(e)}")
         return None

     # Check file format
@@ -131,9 +123,7 @@ def index_resource_data(resource: Resource) -> Optional[ResourceDataTable]:
         )
         return None
     except Exception as e:
-        logger.error(
-            f"Failed to determine format for resource {resource_id}: {str(e)}"
-        )
+        logger.error(f"Failed to determine format for resource {resource_id}: {str(e)}")
         return None

     # Load tabular data with timeout protection
@@ -144,9 +134,7 @@ def index_resource_data(resource: Resource) -> Optional[ResourceDataTable]:
     @contextmanager
     def timeout(seconds: int) -> Generator[None, None, None]:
         def handler(signum: int, frame: Any) -> None:
-            raise TimeoutError(
-                f"Loading data timed out after {seconds} seconds"
-            )
+            raise TimeoutError(f"Loading data timed out after {seconds} seconds")

         # Set the timeout handler
         original_handler = signal.getsignal(signal.SIGALRM)
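
The timeout protection referenced in this hunk is a signal.SIGALRM based context manager. A minimal sketch of that pattern, assuming a Unix-like platform (SIGALRM does not exist on Windows, which is why the diff keeps a fallback path without it); this is an illustration of the technique, not the exact code from the file:

import signal
from contextlib import contextmanager
from typing import Any, Generator


@contextmanager
def timeout(seconds: int) -> Generator[None, None, None]:
    def handler(signum: int, frame: Any) -> None:
        raise TimeoutError(f"Loading data timed out after {seconds} seconds")

    # Install the handler and arm the alarm
    original_handler = signal.signal(signal.SIGALRM, handler)
    signal.alarm(seconds)
    try:
        yield
    finally:
        # Always disarm the alarm and restore the previous handler
        signal.alarm(0)
        signal.signal(signal.SIGALRM, original_handler)


# Hypothetical usage:
# with timeout(60):
#     df = load_tabular_data(path, fmt)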
@@ -163,9 +151,7 @@ def handler(signum: int, frame: Any) -> None:
         with timeout(60):  # 60 second timeout for loading data
             df = load_tabular_data(file_details.file.path, format)
     except TimeoutError as te:
-        logger.error(
-            f"Timeout while loading data for resource {resource_id}: {str(te)}"
-        )
+        logger.error(f"Timeout while loading data for resource {resource_id}: {str(te)}")
         return None
     except Exception:
         # Fallback without timeout if signal.SIGALRM is not available (e.g., on Windows)
@@ -204,9 +190,7 @@ def handler(signum: int, frame: Any) -> None:
                 # Rename all but the first occurrence
                 for i, idx in enumerate(indices[1:], 1):
                     df.columns.values[idx] = f"{col}_{i}"
-                logger.warning(
-                    f"Renamed duplicate columns in resource {resource_id}"
-                )
+                logger.warning(f"Renamed duplicate columns in resource {resource_id}")
     except Exception as e:
         logger.error(
             f"Failed to sanitize column names for resource {resource_id}: {str(e)}"
@@ -229,9 +213,7 @@ def handler(signum: int, frame: Any) -> None:
         existing_table = ResourceDataTable.objects.get(resource=resource)
         try:
             with connections[DATA_DB].cursor() as cursor:
-                cursor.execute(
-                    f'DROP TABLE IF EXISTS "{existing_table.table_name}"'
-                )
+                cursor.execute(f'DROP TABLE IF EXISTS "{existing_table.table_name}"')
         except Exception as drop_error:
             logger.error(
                 f"Failed to drop existing table for resource {resource_id}: {str(drop_error)}"
@@ -292,15 +274,11 @@ def handler(signum: int, frame: Any) -> None:
             # For description, preserve existing if available, otherwise auto-generate
             description = f"Description of column {col}"
             if col in existing_schemas:
-                existing_description = existing_schemas[col][
-                    "description"
-                ]
+                existing_description = existing_schemas[col]["description"]
                 # Check for None and non-auto-generated descriptions
                 if existing_description is not None:
                     description = existing_description
-                    logger.debug(
-                        f"Preserved custom description for column {col}"
-                    )
+                    logger.debug(f"Preserved custom description for column {col}")

             # Create the schema entry
             ResourceSchema.objects.create(
@@ -393,9 +371,7 @@ def get_row_count(resource: Resource) -> int:
         import traceback

         error_tb = traceback.format_exc()
-        logger.error(
-            f"Error getting row count for resource {resource.id}:\n{str(e)}\n{error_tb}"
-        )
+        logger.error(f"Error getting row count for resource {resource.id}:\n{str(e)}\n{error_tb}")
         return 0


@@ -429,9 +405,7 @@ def get_preview_data(resource: Resource) -> Optional[PreviewData]:
             try:
                 if is_all_entries:
                     # For safety, always limit the number of rows returned even for 'all entries'
-                    cursor.execute(
-                        f'SELECT * FROM "{data_table.table_name}" LIMIT 1000'
-                    )
+                    cursor.execute(f'SELECT * FROM "{data_table.table_name}" LIMIT 1000')
                 else:
                     # Ensure we have valid integer values for the calculation
                     start = int(start_entry) if start_entry is not None else 0
@@ -443,8 +417,8 @@ def get_preview_data(resource: Resource) -> Optional[PreviewData]:

                 columns = [desc[0] for desc in cursor.description]
                 data = cursor.fetchall()
-                # Convert tuples to lists
-                rows = [list(row) for row in data]
+                # Convert tuples to lists and sanitize None values to empty strings
+                rows = [[cell if cell is not None else "" for cell in row] for row in data]
                 return PreviewData(columns=columns, rows=rows)
             except Exception as query_error:
                 logger.error(
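
The last hunk carries the substance of the commit: SQL NULLs come back from cursor.fetchall() as Python None and previously passed into the preview rows as-is. A standalone sketch of the sanitization (sanitize_rows is a hypothetical helper name, not part of the codebase):

def sanitize_rows(data):
    # Replace None (SQL NULL) cells with empty strings so every preview cell is renderable
    return [[cell if cell is not None else "" for cell in row] for row in data]


# Example:
# sanitize_rows([("Alice", None, 3), (None, "x", None)])
# returns [['Alice', '', 3], ['', 'x', '']]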