55require "openssl"
66require "base64"
77require "connection_pool"
8+ require "yaml"
89
# Application setting: keep the default request protection but exclude the
# :json_csrf check.
set :protection, except: [:json_csrf]

# Process-wide pool (one connection, 5 second checkout timeout) of persistent
# HTTP clients pointed at the URL given by the PARSER_URL environment variable.
$parser = ConnectionPool.new(size: 1, timeout: 5) {
  HTTP.persistent(ENV["PARSER_URL"])
}
1415
# Table of API users, looked up by user name; the value is that user's HMAC key.
#
# When the EXTRACT_USERS environment variable is set it names a YAML file to
# load the table from; otherwise a single "demo" user with key "demo" is used.
# An empty YAML file loads as nil, so fall back to an empty table to keep the
# later `$users.key?` / `$users[...]` lookups from raising.
$users =
  if ENV["EXTRACT_USERS"]
    YAML.safe_load_file(ENV["EXTRACT_USERS"]) || {}
  else
    { "demo" => "demo" }
  end
23+
24+
# Checks that +signature+ is the hex HMAC-SHA1 of +data+ under +user+'s key.
#
# @param user [String] name looked up in the global $users table
# @param signature [String, nil] hex digest supplied by the caller
# @param data [String] the signed payload (the routes pass the URL)
# @return [Boolean] false when the user has no key, the signature is not a
#   String, or the digest does not match
def signature_valid?(user, signature, data)
  key = $users[user]
  return false unless key && signature.is_a?(String)

  expected = OpenSSL::HMAC.hexdigest("sha1", key, data)
  # Constant-time comparison so response timing does not leak how much of a
  # forged signature matched.
  OpenSSL.secure_compare(signature, expected)
end
2031
# Sends a parser payload to the parser service and returns the response body.
#
# Checks a connection out of the global $parser pool and POSTs +json+ to
# "/parser". On an OK status the upstream Content-Type is copied onto the
# current response and the body is returned; on any other status
# halt_with_error("Cannot extract this URL.") is called.
#
# @param json [Hash] payload sent as the JSON request body
# @return [String] body of the parser service's response
def parse_with_mercury(json)
  $parser.with do |connection|
    response = connection
      .timeout(connect: 1, write: 5, read: 5)
      .post("/parser", json: json)

    # The body is read before the status check, matching the original order.
    # NOTE(review): presumably this drains the persistent connection before it
    # goes back to the pool - confirm.
    body = response.to_s
    halt_with_error("Cannot extract this URL.") unless response.status.ok?
    headers("Content-Type" => response.headers[:content_type])
    body
  end
end
2844
@@ -33,20 +49,37 @@ def halt_with_error(error)
3349 } . to_json
3450end
3551
# Builds the payload hash that is posted to the parser service.
#
# @param url [String] address of the document
# @param html [String] markup of the document
# @param content_type [String] content type, emitted under the camelCase
#   key :contentType
# @return [Hash] { url:, options: { html:, contentType: } }
def parser_object(url:, html:, content_type:)
  {
    url: url,
    options: {
      html: html,
      contentType: content_type
    }
  }
end
61+
# Downloads +url+ and wraps the result in a parser payload.
#
# The request follows up to 5 redirects, uses 4s connect / 4s write / 5s read
# timeouts, and advertises gzip/deflate with the :auto_inflate feature enabled.
#
# @param url [String] address to fetch
# @return [Hash] payload built by parser_object from the response body and
#   its Content-Type header
def download_with_http(url)
  response = HTTP
    .follow(max_hops: 5)
    .timeout(connect: 4, write: 4, read: 5)
    .headers({ accept_encoding: "gzip, deflate" })
    .use(:auto_inflate)
    .get(url)

  parser_object(url: url, html: response.to_s, content_type: response.headers[:content_type])
end
72+
# Rejects the request unless +user+ is known and +signature+ matches +url+.
#
# Calls halt_with_error with a "User does not exist" message when +user+ is
# not a key of the global $users table, and with "Invalid signature." when
# signature_valid? returns a falsy value.
#
# @param user [String] user name from the request
# @param signature [String] signature from the request
# @param url [String] the value that was signed
def authenticate(user, signature, url)
  halt_with_error("User does not exist: #{user} .") unless $users.key?(user)
  halt_with_error("Invalid signature.") unless signature_valid?(user, signature, url)
end
77+
# Logs a failed request and answers with the generic extraction error.
#
# Writes one log line naming the exception, URL and user, then the backtrace,
# then calls halt_with_error("Cannot extract this URL.").
#
# @param exception [Exception] the rescued error
# @param url [String, nil] URL being processed when the error occurred
# @param user [String, nil] user name from the request
def response_error!(exception, url, user)
  logger.error "Exception processing exception=#{exception} url=#{url} user=#{user} "
  logger.error exception.backtrace.join("\n ")
  halt_with_error("Cannot extract this URL.")
  # NOTE(review): only reached if halt_with_error returns; its body is not
  # visible here - confirm whether this re-raise can ever run.
  raise exception
end
5184
5285get "/health_check" do
@@ -62,21 +95,32 @@ def download_with_http(url)
6295
6396 logger . info "url=#{ url } "
6497
65- begin
66- halt_with_error ( "Invalid signature." ) unless signature_valid? ( params [ "user" ] , params [ "signature" ] , url )
67- rescue Errno ::ENOENT
68- halt_with_error ( "User does not exist: #{ params [ "user" ] } ." )
69- end
98+ authenticate ( params [ "user" ] , params [ "signature" ] , url )
7099
71100 payload = download_with_http ( url )
72- response = parse ( payload )
73- body = response . to_s
74- halt_with_error ( "Cannot extract this URL." ) unless response . status . ok?
75- headers ( "Content-Type" => response . headers [ :content_type ] )
76- body
101+
102+ parse_with_mercury ( payload )
77103rescue => exception
78- logger . error "Exception processing exception=#{ exception } url=#{ url } user=#{ params [ "user" ] } "
79- logger . error exception . backtrace . join ( "\n " )
80- halt_with_error ( "Cannot extract this URL." )
81- raise exception
104+ response_error! ( exception , url , params [ "user" ] )
105+ end
106+
# POST /parser/:user/:signature
#
# Parses caller-supplied HTML instead of downloading it. The request body is
# a JSON object with a "url" field (the value the signature covers) and a
# "body" field (the HTML). After authentication the payload is forwarded to
# the parser service and its response body is returned.
post "/parser/:user/:signature" do
  json = begin
    JSON.parse(request.body.read)
  rescue JSON::ParserError
    halt_with_error("Invalid JSON body.")
  end

  # Valid JSON need not be an object (e.g. an array, number or null); indexing
  # those with "url" would raise instead of producing a clear client error.
  halt_with_error("JSON body must be an object.") unless json.is_a?(Hash)
  halt_with_error("Missing url field in JSON body.") unless json["url"]
  halt_with_error("Missing body field in JSON body.") unless json["body"]

  logger.info "url=#{json["url"]} "

  authenticate(params["user"], params["signature"], json["url"])

  payload = parser_object(url: json["url"], html: json["body"], content_type: "text/html")

  parse_with_mercury(payload)
rescue => exception
  # This route never assigns a local `url`, so report the URL taken from the
  # JSON body; `json` is nil here if the failure happened before it was set.
  submitted_url = json["url"] if json.is_a?(Hash)
  response_error!(exception, submitted_url, params["user"])
end
0 commit comments