Coverage for bioimageio/spec/_internal/url.py: 97%

73 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-04-02 14:21 +0000

1from typing import Any, ClassVar, Optional, Type, Union 

2 

3import pydantic 

4import requests 

5import requests.exceptions 

6from loguru import logger 

7from pydantic import RootModel 

8from typing_extensions import Literal, assert_never 

9 

10from .field_warning import issue_warning 

11from .root_url import RootHttpUrl 

12from .validation_context import get_validation_context 

13 

14 

def _validate_url(url: Union[str, pydantic.HttpUrl]) -> pydantic.HttpUrl:
    """Validate *url*, probing it with a HEAD request first.

    Thin convenience wrapper that delegates to `_validate_url_impl` with
    ``request_mode="head"`` and that function's default timeout.
    """
    validated = _validate_url_impl(url, request_mode="head")
    return validated

17 

18 

def _validate_url_impl(
    url: Union[str, pydantic.HttpUrl],
    request_mode: Literal["head", "get_stream", "get"],
    timeout: int = 3,
) -> pydantic.HttpUrl:
    """Check that *url* can be requested and return it as `pydantic.HttpUrl`.

    URLs already listed in the validation context's ``known_files`` and URLs
    starting with ``http(s)://example.com`` are returned without any request.
    Otherwise the URL is requested using *request_mode*; if the status code
    does not settle the question, the check escalates by calling itself again:
    ``"head"`` -> ``"get_stream"`` -> ``"get"``.

    Args:
        url: The URL to check.
        request_mode: Kind of request to issue for this attempt.
        timeout: Timeout in seconds passed on to `requests`.

    Returns:
        The URL wrapped in `pydantic.HttpUrl`.

    Raises:
        ValueError: If `requests` reports the URL/request itself as invalid
            (see the exception tuple below), or if the final ``"get"``
            attempt answers with a status code other than
            200, 301, 302, 303, 308 or 405.

    Note:
        Any other request failure (e.g. a generic
        `requests.RequestException`) only issues a warning; the URL is then
        still recorded in ``known_files`` and returned.
    """
    url = str(url)
    context = get_validation_context()
    if url in context.known_files:
        with context.replace(perform_io_checks=False):
            return (  # pyright: ignore[reportUnknownVariableType]
                # TODO: remove pyright ignore for pydantic > 2.9
                pydantic.HttpUrl(url)  # pyright: ignore[reportCallIssue]
            )

    # URL that is actually requested; may differ from `url` (see colab below)
    val_url = url

    # example.com placeholder URLs are accepted without issuing a request
    if url.startswith(("http://example.com", "https://example.com")):
        return pydantic.HttpUrl(  # pyright: ignore[reportUnknownVariableType,reportCallIssue]
            url
        )

    if url.startswith("https://colab.research.google.com/github/"):
        # GET requests for colab return 200 even if the source notebook does
        # not exist. We therefore validate the URL of the notebook itself
        # instead (for GitHub notebooks).
        val_url = url.replace(
            "https://colab.research.google.com/github/", "https://github.com/"
        )
    elif url.startswith("https://colab.research.google.com/"):
        # TODO: improve validation of non-github colab urls
        issue_warning(
            "colab urls currently pass even if the notebook url was not found. Cannot fully validate {value}",
            value=url,
        )

    try:
        if request_mode == "head":
            response = requests.head(val_url, timeout=timeout)
        elif request_mode == "get_stream":
            response = requests.get(val_url, stream=True, timeout=timeout)
        elif request_mode == "get":
            response = requests.get(val_url, stream=False, timeout=timeout)
        else:
            assert_never(request_mode)
    except (
        requests.exceptions.ChunkedEncodingError,
        requests.exceptions.ContentDecodingError,
        requests.exceptions.InvalidHeader,
        requests.exceptions.InvalidJSONError,
        requests.exceptions.InvalidSchema,
        requests.exceptions.InvalidURL,
        requests.exceptions.MissingSchema,
        requests.exceptions.StreamConsumedError,
        requests.exceptions.TooManyRedirects,
        requests.exceptions.UnrewindableBodyError,
        requests.exceptions.URLRequired,
    ) as e:
        # These errors are treated as an invalid URL: fail validation and
        # keep the original exception as the explicit cause.
        raise ValueError(
            f"Invalid URL '{url}': {e}\nrequest: {e.request}\nresponse: {e.response}"
        ) from e
    except requests.RequestException as e:
        # Remaining request errors: warn only, do not fail validation.
        issue_warning(
            "Failed to validate URL '{value}': {error}\nrequest: {request}\nresponse: {response}",
            value=url,
            msg_context={"error": str(e), "response": e.response, "request": e.request},
        )
    except Exception as e:
        issue_warning(
            "Failed to validate URL '{value}': {error}",
            value=url,
            msg_context={"error": str(e)},
        )
    else:
        # Only the status line and headers are inspected below. Close the
        # response right away so that a streamed (unread) body does not keep
        # its connection open, in particular across the recursive retries.
        response.close()

        if response.status_code == 200:  # ok
            pass
        elif response.status_code in (302, 303):  # found
            pass
        elif response.status_code in (301, 308):
            # permanent redirect: accept, but suggest updating the URL
            issue_warning(
                "URL redirected ({status_code}): consider updating {value} with new"
                + " location: {location}",
                value=url,
                msg_context={
                    "status_code": response.status_code,
                    "location": response.headers.get("location"),
                },
            )
        elif request_mode == "head":
            # inconclusive HEAD response: retry with a streamed GET
            return _validate_url_impl(url, request_mode="get_stream", timeout=timeout)
        elif request_mode == "get_stream":
            # inconclusive streamed GET response: retry with a plain GET
            return _validate_url_impl(url, request_mode="get", timeout=timeout)
        elif response.status_code == 405:
            # only reached in "get" mode (earlier branches handle the others)
            issue_warning(
                "{status_code}: {reason} {value}",
                value=url,
                msg_context={
                    "status_code": response.status_code,
                    "reason": response.reason,
                },
            )
        elif request_mode == "get":
            raise ValueError(f"{response.status_code}: {response.reason} {url}")
        else:
            assert_never(request_mode)

    # Remember the URL so that later validations skip the request.
    context.known_files[url] = None
    return (  # pyright: ignore[reportUnknownVariableType]
        # TODO: remove pyright ignore for pydantic > 2.9
        pydantic.HttpUrl(url)  # pyright: ignore[reportCallIssue]
    )

129 

130 

class HttpUrl(RootHttpUrl):
    """A URL with the HTTP or HTTPS scheme."""

    # NOTE(review): presumably consumed by the base class to validate the raw
    # value -- confirm against `RootHttpUrl`.
    root_model: ClassVar[Type[RootModel[Any]]] = RootModel[pydantic.HttpUrl]

    # Cached outcome of the URL check: None until a check has been performed.
    _exists: Optional[bool] = None

    def _after_validator(self):
        """Run the base class validator, then check the URL if IO checks are on.

        The URL check is only performed when the current validation context
        has ``perform_io_checks`` enabled; on success the result is cached in
        ``_exists``. Exceptions raised by the check propagate to the caller.
        """
        instance = super()._after_validator()
        io_checks_enabled = get_validation_context().perform_io_checks
        if io_checks_enabled:
            instance._validated = _validate_url(instance._validated)
            instance._exists = True

        return instance

    def exists(self):
        """Return whether the URL passed the URL check.

        The check runs at most once per instance; its outcome is cached in
        ``_exists``. Any exception raised by the check is logged at info
        level and reported as ``False``.
        """
        if self._exists is not None:
            # already checked (here or during validation)
            return self._exists

        try:
            self._validated = _validate_url(self._validated)
        except Exception as e:
            logger.info(e)
            self._exists = False
        else:
            self._exists = True

        return self._exists