Skip to content

Commit d764da3

Browse files
dibahlfi and simorenoh authored
fix: partition split recursion issue (#44649)
* fix: partition split recursion issue * fix: fixing passing internal flag to azure-core * fix: refactoring log statements * fix: fixing tests * fix: comments refactoring * fix: adding more test coverage * fix: conditionally importing module * Update sdk/cosmos/azure-cosmos/CHANGELOG.md * fix: updating version --------- Co-authored-by: Simon Moreno <30335873+simorenoh@users.noreply.github.com>
1 parent f7809e0 commit d764da3

File tree

11 files changed

+759
-6
lines changed

11 files changed

+759
-6
lines changed

sdk/cosmos/azure-cosmos/CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
## Release History
22

3+
### 4.14.5 (2026-01-15)
4+
5+
#### Bugs Fixed
6+
* Fixed bug where the SDK encountered a timeout caused by infinite recursion while handling a 410 (Gone) error. See [PR 44649](https://github.com/Azure/azure-sdk-for-python/pull/44649)
7+
38
### 4.14.4 (2026-01-12)
49

510
#### Bugs Fixed

sdk/cosmos/azure-cosmos/azure/cosmos/_execution_context/aio/base_execution_context.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,12 @@
2525

2626
from collections import deque
2727
import copy
28+
import logging
2829

2930
from ...aio import _retry_utility_async
3031
from ... import http_constants, exceptions
3132

33+
_LOGGER = logging.getLogger(__name__)
3234

3335
# pylint: disable=protected-access
3436

@@ -145,23 +147,53 @@ async def callback(**kwargs): # pylint: disable=unused-argument
145147
self._client, self._client._global_endpoint_manager, callback, **self._options
146148
)
147149

150+
# Check if this is an internal partition key range fetch - skip 410 retry logic to avoid recursion
151+
# When we call refresh_routing_map_provider(), it triggers _ReadPartitionKeyRanges which would
152+
# come through this same code path. If that also gets a 410 and tries to refresh, we get infinite recursion.
153+
is_pk_range_fetch = self._options.get("_internal_pk_range_fetch", False)
154+
if is_pk_range_fetch:
155+
# For partition key range queries, just execute without 410 partition split retry
156+
# The underlying retry utility will still handle other transient errors
157+
_LOGGER.debug("Partition split retry (async): Skipping 410 retry for internal PK range fetch")
158+
return await execute_fetch()
159+
148160
max_retries = 3
149161
attempt = 0
162+
150163
while attempt <= max_retries:
151164
try:
152165
return await execute_fetch()
153166
except exceptions.CosmosHttpResponseError as e:
154167
if exceptions._partition_range_is_gone(e):
155168
attempt += 1
156169
if attempt > max_retries:
170+
_LOGGER.error(
171+
"Partition split retry (async): Exhausted all %d retries. "
172+
"state: _has_started=%s, _continuation=%s",
173+
max_retries, self._has_started, self._continuation
174+
)
157175
raise # Exhausted retries, propagate error
158176

177+
_LOGGER.warning(
178+
"Partition split retry (async): 410 error (sub_status=%s). Attempt %d of %d. "
179+
"Refreshing routing map and resetting state.",
180+
getattr(e, 'sub_status', 'N/A'),
181+
attempt,
182+
max_retries
183+
)
184+
159185
# Refresh routing map to get new partition key ranges
160186
self._client.refresh_routing_map_provider()
187+
# Reset execution context state to allow retry from the beginning
188+
self._has_started = False
189+
self._continuation = None
161190
# Retry immediately (no backoff needed for partition splits)
162191
continue
163192
raise # Not a partition split error, propagate immediately
164193

194+
# This should never be reached, but added for safety
195+
return []
196+
165197

166198
class _DefaultQueryExecutionContext(_QueryExecutionContextBase):
167199
"""

sdk/cosmos/azure-cosmos/azure/cosmos/_execution_context/base_execution_context.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,10 @@
2525

2626
from collections import deque
2727
import copy
28+
import logging
2829
from .. import _retry_utility, http_constants, exceptions
2930

31+
_LOGGER = logging.getLogger(__name__)
3032

3133
# pylint: disable=protected-access
3234

@@ -143,6 +145,16 @@ def callback(**kwargs): # pylint: disable=unused-argument
143145
self._client, self._client._global_endpoint_manager, callback, **self._options
144146
)
145147

148+
# Check if this is an internal partition key range fetch - skip 410 retry logic to avoid recursion
149+
# When we call refresh_routing_map_provider(), it triggers _ReadPartitionKeyRanges which would
150+
# come through this same code path. If that also gets a 410 and tries to refresh, we get infinite recursion.
151+
is_pk_range_fetch = self._options.get("_internal_pk_range_fetch", False)
152+
if is_pk_range_fetch:
153+
# For partition key range queries, just execute without 410 partition split retry
154+
# The underlying retry utility will still handle other transient errors
155+
_LOGGER.debug("Partition split retry: Skipping 410 retry for internal PK range fetch")
156+
return execute_fetch()
157+
146158
max_retries = 3
147159
attempt = 0
148160

@@ -153,13 +165,32 @@ def callback(**kwargs): # pylint: disable=unused-argument
153165
if exceptions._partition_range_is_gone(e):
154166
attempt += 1
155167
if attempt > max_retries:
168+
_LOGGER.error(
169+
"Partition split retry: Exhausted all %d retries. "
170+
"state: _has_started=%s, _continuation=%s",
171+
max_retries, self._has_started, self._continuation
172+
)
156173
raise # Exhausted retries, propagate error
157174

175+
_LOGGER.warning(
176+
"Partition split retry: 410 error (sub_status=%s). Attempt %d of %d. "
177+
"Refreshing routing map and resetting state.",
178+
getattr(e, 'sub_status', 'N/A'),
179+
attempt,
180+
max_retries
181+
)
182+
158183
# Refresh routing map to get new partition key ranges
159184
self._client.refresh_routing_map_provider()
185+
# Reset execution context state to allow retry from the beginning
186+
self._has_started = False
187+
self._continuation = None
160188
# Retry immediately (no backoff needed for partition splits)
161189
continue
162190
raise # Not a partition split error, propagate immediately
191+
192+
# This should never be reached, but added for safety
193+
return []
163194
next = __next__ # Python 2 compatibility.
164195

165196

sdk/cosmos/azure-cosmos/azure/cosmos/_routing/aio/routing_map_provider.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,15 @@
2222
"""Internal class for partition key range cache implementation in the Azure
2323
Cosmos database service.
2424
"""
25+
import logging
2526
from typing import Any, Optional
2627

2728
from ... import _base
2829
from ..collection_routing_map import CollectionRoutingMap
2930
from .. import routing_range
3031

32+
_LOGGER = logging.getLogger(__name__)
33+
3134
# pylint: disable=protected-access
3235

3336

@@ -75,18 +78,33 @@ async def init_collection_routing_map_if_needed(
7578
):
7679
collection_routing_map = self._collection_routing_map_by_item.get(collection_id)
7780
if collection_routing_map is None:
81+
# Pass _internal_pk_range_fetch flag to prevent recursive 410 retry logic
82+
# When a 410 partition split error occurs, the SDK calls refresh_routing_map_provider()
83+
# which clears the cache and retries. The retry needs partition key ranges, which calls
84+
# this method, which triggers _ReadPartitionKeyRanges. If that query also goes through
85+
# the 410 retry logic and calls refresh again, we get infinite recursion.
86+
_LOGGER.debug(
87+
"PK range cache (async): Initializing routing map for collection_id=%s with "
88+
"_internal_pk_range_fetch=True to prevent recursive 410 retry.",
89+
collection_id
90+
)
91+
pk_range_kwargs = {**kwargs, "_internal_pk_range_fetch": True}
7892
collection_pk_ranges = [pk async for pk in
7993
self._documentClient._ReadPartitionKeyRanges(collection_link,
8094
feed_options,
81-
**kwargs)]
95+
**pk_range_kwargs)]
8296
# for large collections, a split may complete between the read partition key ranges query page responses,
8397
# causing the partitionKeyRanges to have both the children ranges and their parents. Therefore, we need
8498
# to discard the parent ranges to have a valid routing map.
85-
collection_pk_ranges = PartitionKeyRangeCache._discard_parent_ranges(collection_pk_ranges)
99+
collection_pk_ranges = list(PartitionKeyRangeCache._discard_parent_ranges(collection_pk_ranges))
86100
collection_routing_map = CollectionRoutingMap.CompleteRoutingMap(
87101
[(r, True) for r in collection_pk_ranges], collection_id
88102
)
89103
self._collection_routing_map_by_item[collection_id] = collection_routing_map
104+
_LOGGER.debug(
105+
"PK range cache (async): Cached routing map for collection_id=%s with %d ranges",
106+
collection_id, len(collection_pk_ranges)
107+
)
90108

91109
async def get_range_by_partition_key_range_id(
92110
self,

sdk/cosmos/azure-cosmos/azure/cosmos/_routing/routing_map_provider.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,16 @@
2222
"""Internal class for partition key range cache implementation in the Azure
2323
Cosmos database service.
2424
"""
25+
import logging
2526
from typing import Any, Optional
2627

2728
from .. import _base
2829
from .collection_routing_map import CollectionRoutingMap
2930
from . import routing_range
3031
from .routing_range import PartitionKeyRange
3132

33+
_LOGGER = logging.getLogger(__name__)
34+
3235

3336
# pylint: disable=protected-access
3437

@@ -61,17 +64,32 @@ def init_collection_routing_map_if_needed(
6164
):
6265
collection_routing_map = self._collection_routing_map_by_item.get(collection_id)
6366
if not collection_routing_map:
67+
# Pass _internal_pk_range_fetch flag to prevent recursive 410 retry logic
68+
# When a 410 partition split error occurs, the SDK calls refresh_routing_map_provider()
69+
# which clears the cache and retries. The retry needs partition key ranges, which calls
70+
# this method, which triggers _ReadPartitionKeyRanges. If that query also goes through
71+
# the 410 retry logic and calls refresh again, we get infinite recursion.
72+
_LOGGER.debug(
73+
"PK range cache: Initializing routing map for collection_id=%s with "
74+
"_internal_pk_range_fetch=True to prevent recursive 410 retry.",
75+
collection_id
76+
)
77+
pk_range_kwargs = {**kwargs, "_internal_pk_range_fetch": True}
6478
collection_pk_ranges = list(self._documentClient._ReadPartitionKeyRanges(collection_link,
6579
feed_options,
66-
**kwargs))
80+
**pk_range_kwargs))
6781
# for large collections, a split may complete between the read partition key ranges query page responses,
6882
# causing the partitionKeyRanges to have both the children ranges and their parents. Therefore, we need
6983
# to discard the parent ranges to have a valid routing map.
70-
collection_pk_ranges = PartitionKeyRangeCache._discard_parent_ranges(collection_pk_ranges)
84+
collection_pk_ranges = list(PartitionKeyRangeCache._discard_parent_ranges(collection_pk_ranges))
7185
collection_routing_map = CollectionRoutingMap.CompleteRoutingMap(
7286
[(r, True) for r in collection_pk_ranges], collection_id
7387
)
7488
self._collection_routing_map_by_item[collection_id] = collection_routing_map
89+
_LOGGER.debug(
90+
"PK range cache: Cached routing map for collection_id=%s with %d ranges",
91+
collection_id, len(collection_pk_ranges)
92+
)
7593

7694
def get_overlapping_ranges(self, collection_link, partition_key_ranges, feed_options, **kwargs):
7795
"""Given a partition key range and a collection, return the list of

sdk/cosmos/azure-cosmos/azure/cosmos/_synchronized_request.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,8 @@ def _Request(global_endpoint_manager, request_params, connection_policy, pipelin
8080
"""
8181
# pylint: disable=protected-access, too-many-branches
8282
kwargs.pop(_Constants.OperationStartTime, None)
83+
# Pop internal flags that should not be passed to the HTTP layer
84+
kwargs.pop("_internal_pk_range_fetch", None)
8385
connection_timeout = connection_policy.RequestTimeout
8486
connection_timeout = kwargs.pop("connection_timeout", connection_timeout)
8587
read_timeout = connection_policy.ReadTimeout

sdk/cosmos/azure-cosmos/azure/cosmos/_version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,4 +19,4 @@
1919
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
2020
# SOFTWARE.
2121

22-
VERSION = "4.14.4"
22+
VERSION = "4.14.5"

sdk/cosmos/azure-cosmos/azure/cosmos/aio/_asynchronous_request.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ async def _Request(global_endpoint_manager, request_params, connection_policy, p
5252
"""
5353
# pylint: disable=protected-access, too-many-branches
5454
kwargs.pop(_Constants.OperationStartTime, None)
55+
# Pop internal flags that should not be passed to the HTTP layer
56+
kwargs.pop("_internal_pk_range_fetch", None)
5557
connection_timeout = connection_policy.RequestTimeout
5658
read_timeout = connection_policy.ReadTimeout
5759
connection_timeout = kwargs.pop("connection_timeout", connection_timeout)

sdk/cosmos/azure-cosmos/tests/routing/test_routing_map_provider.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ class MockedCosmosClientConnection(object):
1818
def __init__(self, partition_key_ranges):
1919
self.partition_key_ranges = partition_key_ranges
2020

21-
def _ReadPartitionKeyRanges(self, collection_link: str, feed_options: Optional[Mapping[str, Any]] = None):
21+
def _ReadPartitionKeyRanges(self, collection_link: str, feed_options: Optional[Mapping[str, Any]] = None, **kwargs):
2222
return self.partition_key_ranges
2323

2424
def setUp(self):

0 commit comments

Comments
 (0)