% File:  [Public] / 2004 / semwalker / webload.pl
% Revision 1.27: download - view: text, annotated - select for diffs
% Fri Oct 8 17:42:33 2004 UTC (21 years, 1 month ago) by sandro
% Branches: MAIN
% CVS tags: before_oct_2006_restructuring, HEAD
% bug fixing; mostly working!

:- ensure_loaded(common).
cvs_id('$Id: webload.pl,v 1.27 2004/10/08 17:42:33 sandro Exp $').

% ? dereference(Term, [robot_name(N), max_threads(N), background_after(S)])

:- ensure_loaded(retrieve).
:- ensure_loaded(thread_util).

:- ensure_loaded(library(debug)).


%%  webload_bg(+URI, +Timeout, -Status)
%%
%%  Start a webload in another thread and return when it's done
%%  or when Timeout seconds have passed (whichever is sooner).
%%
%%  Status is `fresh` when fresh/2 accepts URI (nothing is fetched in
%%  that case).  Otherwise Status is whatever interruptable_sleep/2
%%  reports after the worker thread was started; this is expected to
%%  be one of timeout, done, or failed (the exact terms come from
%%  thread_util -- TODO confirm).
%%
%%  Earlier revisions hard-coded `fresh` in the clause head, so the
%%  outcome of the background fetch never reached the caller.


webload_bg(URI, Timeout, Status) :-
	(   fresh(URI, Ret)
	->  Status = fresh,
	    % NOTE(review): fresh/2 leaves Ret unbound when the URI scheme
	    % cannot be dereferenced, so pagein/1 may be called with an
	    % unbound argument here -- confirm pagein/1 tolerates that.
	    pagein(Ret),
	    debug(webload, 'still fresh: ~a', [URI])
	;   debug(webload, 'not fresh: ~a; fetching', [URI]),
	    % Drop leftover messages so that an old `done`/error message
	    % is not mistaken for the result of this fetch.
	    thread_clear_all_messages,
	    thread_self(Me),
	    % The worker reports back to this thread: `done` from the
	    % second argument, or the Error term from the fourth
	    % (presumably on success / on error respectively -- see
	    % thread_create_caught in thread_util).
	    thread_create_caught(webload(URI),
				 (   debug(webload, 'webload thread done', []),
				     thread_send_message(Me, done)
				 ),
				 Error,
				 (   debug(webload, 'webload thread status: ~q', [Error]),
				     thread_send_message(Me, Error)
				 )
				),
	    % Wait up to Timeout seconds; Status is the value handed back
	    % to our caller.
	    interruptable_sleep(Timeout, Status),
	    debug(webload, '~a tried to fetch ~a for ~q seconds: ~q', [Me, URI, Timeout, Status])
	).

%%  webload(+URI)
%%
%%  Make sure the content from the given URI is loaded.
%%
%%  Nothing is done when rdfpage_uri/1 already succeeds for URI.
%%  Otherwise a retrieval recorded with URI as its
%%  retr:initialResource is reused if one exists, else retrieve/4 is
%%  called; the resulting retrieval is handed to webload2/2.

webload(URI) :-
	(   rdfpage_uri(URI)
	->  true
	;   (   rdfpage(Retrieval, retr:initialResource, URI)
	    ->  debug(webload, 'using prior retrieval ~a', [Retrieval])
	    ;   retrieve(URI, [], Retrieval, []),
		debug(webload, 'using (maybe-new) retrieval ~a', [Retrieval])
	    ),
	    webload2(Retrieval, URI)
	).

%%  webreload(+URI)
%%
%%  Like webload/1, but do it even if not expired: retrieve/4 is
%%  always called (with the option max_age(1)) and its result is
%%  passed on to webload2/2.

webreload(URI) :-
	RetrieveOptions = [max_age(1)],
	retrieve(URI, [], Fetched, RetrieveOptions),
	debug(webload, 'using new retrieval ~a', [Fetched]),
	webload2(Fetched, URI).
	
%%  webload2(+Ret, +URI)
%%
%%  Shared tail of webload/1 and webreload/1.  Looks up a page for the
%%  retrieval Ret via rdfpage_sub_uri/3, pages Ret in, and records the
%%  current time and URI in the flags latest_rdf_db_mod_time and
%%  latest_rdf_db_mod_source.  Succeeds at most once.

webload2(Ret, URI) :-
	once((   rdfpage_sub_uri(Page, _, Ret),
		 debug(webload, 'page ~a', [Page]),
		 pagein(Ret),
		 get_time(Timestamp),
		 flag(latest_rdf_db_mod_time, _, Timestamp),
		 flag(latest_rdf_db_mod_source, _, URI)
	     )).

%%  chaa
%%
%%  Ad-hoc shortcut: run webload/1 on one fixed FOAF document URI.
%%  Presumably a manual smoke test -- nothing else in this file
%%  calls it.

chaa :-
	webload('http://www.w3.org/People/Charles/foaf.rdf').

%%  worked
%%
%%  Consult 'x/worked_source.pl' and run webload/1 on every URI that
%%  worked_source/1 enumerates.  Fails as soon as one webload/1 call
%%  fails.

worked :-
	consult('x/worked_source.pl'),
	% Same meaning as forall/2: no worked_source/1 solution exists
	% for which webload/1 fails.
	\+ (   worked_source(Source),
	       \+ webload(Source)
	   ).

%%  most_recent_retrieval_success(+URI, -Ret, -Time)
%%
%%  Ret is the retrieval that address_latest_successful_ret/2 gives
%%  for URI, and Time is the unixtime literal stored on Ret under
%%  retr:exchangingHeaders.
%%  NOTE(review): the commented-out older version further down read
%%  retr:startTime instead -- confirm which timestamp callers expect.

most_recent_retrieval_success(URI, Ret, Time) :-
	address_latest_successful_ret(URI, Ret),
	% pagein(Ret),
	rdfpage(Ret, retr:exchangingHeaders, literal(unixtime(Time))).
	

%%  most_recent_retrieval_attempt(+URI, -Ret, -Time)
%%
%%  Ret is the retrieval that address_latest_ret/2 gives for URI
%%  (presumably successful or not -- verify against its definition),
%%  and Time is the unixtime literal stored on Ret under
%%  retr:exchangingHeaders.

most_recent_retrieval_attempt(URI, Ret, Time) :-
	address_latest_ret(URI, Ret),
	% pagein(Ret),
	rdfpage(Ret, retr:exchangingHeaders, literal(unixtime(Time))).

/*
most_recent_retrieval_attempt(URI, Ret, Time) :-
	findall(time(NT, Ret),
		(   rdfpage(Ret, retr:initialResource, URI),
		    rdfpage(Ret, retr:startTime, literal(unixtime(T))),
		    NT is -T
		),
		UnsortedRets),
	sort(UnsortedRets, Rets),
	Rets = [RetPair | _],
	RetPair = time(NTime, Ret),
	Time is -NTime.

  
most_recent_retrieval_success(URI, Ret, Time) :-
	findall(time(NT, Ret),
		(   rdfpage(Ret, retr:initialResource, URI),
		    rdfpage(Ret, rdf:type, retr:'SuccessfulRetrieval'),
		    rdfpage(Ret, retr:startTime, literal(unixtime(T))),
		    % use end time?
		    NT is -T
		),
		UnsortedRets),
	sort(UnsortedRets, Rets),
	Rets = [RetPair | _],
	RetPair = time(NTime, Ret),
	Time is -NTime.

most_recent_retrieval_success_final(URI, Ret, Time) :-
	findall(time(NT, Ret),
		(   rdfpage(Ret, retr:finalResource, URI),
		    rdfpage(Ret, rdf:type, retr:'SuccessfulRetrieval'),
		    rdfpage(Ret, retr:startTime, literal(unixtime(T))),
		    % use end time?
		    NT is -T
		),
		UnsortedRets),
	sort(UnsortedRets, Rets),
	Rets = [RetPair | _],
	RetPair = time(NTime, Ret),
	Time is -NTime.

% use stale/3 instead!!!
expiration(Ret, Time) :-
	% assume 30 minutes after fetch, for now.
	rdfpage(Ret, retr:startTime, literal(unixtime(T))),
	Time is T + 30 * 60.

*/

%%  fresh(+URI, -Ret)
%%
%%  True when URI does not need to be fetched right now.
%%
%%  If the URI's scheme satisfies yes_deref/1, Ret is the latest
%%  retrieval recorded for the URI's stem and the value reported by
%%  stale/3 for it must be greater than zero.  For any other scheme
%%  the URI is called fresh because it can never be dereferenced;
%%  Ret is left unbound in that case.

fresh(URI, Ret) :-
	uri_scheme(URI, Scheme),
	fresh_for_scheme(Scheme, URI, Ret).

% fresh_for_scheme(+Scheme, +URI, -Ret): scheme-specific part of fresh/2.
fresh_for_scheme(Scheme, URI, Ret) :-
	yes_deref(Scheme),
	!,
	uri_stem(URI, Stem),
	address_latest_ret(Stem, Ret),
	stale(Ret, Stale, []),
	Stale > 0.
fresh_for_scheme(_Scheme, _URI, _Ret).   % call it fresh if we can't ever dereference it


%%  retrieval_status(+Ret, -Status)
%%
%%  Obtain a status indicator for the current retrieval, as of right
%%  now.   This can change due to actions of other threads and other
%%  processes.
%%
%%  Status:
%%      startup       no "curl_output" file
%%      connecting    no "data" file
%%      retrieving    no "after_download.in" file
%%      parsing / not rdf       no "save.rdfdb"
%%
%%    just use the pdb?
%%

%retrieval_status(Ret, Status) :-
%	(   rdfpage(Ret, rdf:type, retr:'SuccessfulRetrieval')
%	->  (


% Webmaster