# Copyright (c) 2017, 2023 Luke Shumaker
#
# This work is free. You can redistribute it and/or modify it under
# the terms of the Do What The Fuck You Want To Public License,
# Version 2, as published by Sam Hocevar. See the COPYING file for
# more details.

SHELL=bash -o pipefail
PATH:=$(CURDIR)/bin:$(PATH)
export PATH

url2murl = $(subst %,^25,$(subst :,^3A,$(subst ^,^5E,$1)))
murl2url = $(subst ^5E,^,$(subst ^3A,:,$(subst ^25,%,$1)))

dirfail = ( r=$$?; mv -- '$@'{,.bak}; exit $$r; )

# This is split into stages for when Make has to make decisions about
# the build tree based on the output of a previous stage.  That is:
# these stages exist for a technical GNU Make reason, not for
# human-comprehensibility reasons; so stages have lopsided sizes: the
# first two are very small, and almost everything is in the third
# stage.

all:
# Stage 1 ######################################################################
	$(MAKE) dat/urlkeys.mk
# Stage 2 ######################################################################
	$(MAKE) dat/index.mk
# Stage 3 ######################################################################
	$(MAKE) dat/git
.PHONY: all

COPYING:
	curl -L http://www.wtfpl.net/txt/copying/ >$@

# Stage 1 ######################################################################
#
# Fetch a listing of all relevant URLs.
#
# - `dat/cdxindex.txt`
# - `dat/urlkeys.txt`
# - `dat/urlkeys.mk`

dat:
	mkdir -p $@

dat/cdxindex.txt: | dat
	cdxget 'url=www.unicode.org/Public/*' 'fl=urlkey' 'collapse=urlkey' 'filter=statuscode:200' 'filter=urlkey:.*(cvt|convert)utf.*' | grep -vFx 'org,unicode)/public/2.0-update/cvtutf7.c' > $@

dat/urlkeys.txt: dat/cdxindex.txt
	< $< cut -d '?' -f1 | sort -u > $@

dat/urlkeys.mk: dat/urlkeys.txt
	< $< sed 's/^/urlkeys+=/' > $@

# Stage 2 ######################################################################
#
# Fetch the history for each relevant URL.
#
# - `dat/each-cdx/$(urlkey).txt` (for each urlkey in `dat/urlkeys.mk`)
#
# - `dat/index.txt`
#   has a line for each relevant URL:
#
#       ${wayback_timestamp:YYYYmmddHHMMSS} ${url}
#
# - `dat/index.mk`

ifneq ($(wildcard dat/urlkeys.mk),)
include dat/urlkeys.mk

dat/each-cdx/%.txt:
	@mkdir -p '$(@D)'
	cdxget "url=$$(urlkey2url '$*')" 'filter=statuscode:200' 'fl=timestamp,original' > '$@'

dat/index.txt: $(addprefix dat/each-cdx/,$(addsuffix .txt,$(urlkeys))) dat/urlkeys.txt
	cat -- $(foreach c,$(filter dat/each-cdx/%,$^),'$c') | sort > $@

dat/index.mk: dat/index.txt
	< $< sed -e 's,^,index+=,' -e 's, ,/,' > $@

# Stage 3 ######################################################################
#
# The main stage.
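# Everything below is wrapped in an ifneq/-include guard, so the
# Stage 3 rules only come into existence once `dat/index.mk` has been
# generated by Stage 2.  On a fresh tree the recursive $(MAKE) calls
# in `all` provide that ordering; running the stages by hand would be
# roughly equivalent (a sketch of the same sequence, not a separate
# supported interface):
#
#     make dat/urlkeys.mk && make dat/index.mk && make dat/git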
ifneq ($(wildcard dat/index.mk),)
-include dat/index.mk

# Part 1: Directory indexes:
#
# - `dat/content-dir/$(wayback_timestamp:YYYYmmddHHMMSS)/$(dir_murl)/index.html`
#
# - `dat/content-dir/$(wayback_timestamp:YYYYmmddHHMMSS)/$(dir_murl)/readme.txt`
#
# - `dat/content-dir/$(wayback_timestamp:YYYYmmddHHMMSS)/$(dir_murl)/metadata.txt`
#   has a line for each file mentioned in index.html (this format is
#   controlled by `bin/fmt-metadata`):
#
#       ${file_name} ${file_timestamp:YYYY-mm-dd HH:MM}

dat/content-dir/%/index.html:
	@mkdir -p '$(@D)'
	curl -sfL 'http://web.archive.org/web/$(call murl2url,$(subst /http,id_/http,$*))' > $@

dat/content-dir/%/readme.txt: dat/content-dir/%/index.html
	< $< sed -n '/^<[pP][rR][eE]>$$/,/<\/[pP][rR][eE]>/p' | sed -e 1d -e 's,</[pP][rR][eE]>.*,,' > $@

dat/content-dir/%/metadata.txt: dat/content-dir/%/index.html
	< $< grep -i '^<img' | sed 's/<[^>]*>//g' | grep -vi 'parent directory' | fmt-metadata $(firstword $(subst /, ,$*)) > $@

content-dir = $(foreach u,$(filter %/,$(index)),dat/content-dir/$(call url2murl,$(u)))
download += $(addsuffix readme.txt,$(content-dir)) $(addsuffix metadata.txt,$(content-dir))

# Part 2: File contents:
#
# - `dat/content-file/$(wayback_timestamp:YYYYmmddHHMMSS)/$(file_murl)`

dat/content-file/%:
	@mkdir -p '$(@D)'
	curl -sfL 'http://web.archive.org/web/$(call murl2url,$(subst /http,id_/http,$*))' > $@

content-file = $(foreach u,$(filter-out %/,$(index)),dat/content-file/$(call url2murl,$(u)))
download += $(content-file)

# `download` is a convenience target to download files without
# processing them.  It isn't depended on by anything.
download: $(download)
.PHONY: download

# Part 3: Aggregate:
#
# - `dat/metadata.txt`
#   has a line for each file mentioned in any index.html:
#
#       ${dirindex_wayback_timestamp:YYYYmmddHHMMSS} ${branch_name}/${file_name} ${file_html_timestamp:YYYY-mm-dd HH:MM}
#
#   where the ${dirindex_wayback_timestamp} and ${branch_name} are
#   determined from the path to the relevant index.html.
#
# - `dat/pools/`
#   + pass 1 and pass 1.5:
#     * `dat/pools/files/${file_html_timestamp:YYYYmmddHHMM}-${branch_name}_${file_name}/`
#     * `dat/pools/snaps/${dirindex_wayback_timestamp:YYYYmmddHHMMSS}-${branch_name}/${file_name}` (symlink to the /files/ file)
#   + pass 2 and pass 3:
#     * `dat/pools/files/${file_html_timestamp:YYYYmmddHHMM}-${branch_name}_${file_name}/${file_name}` (for each existing /files/ dir)

dat/metadata.txt: $(addsuffix metadata.txt,$(content-dir)) dat/index.txt
	grep ^ $(foreach c,$(filter %/metadata.txt,$^),'$c') | sed -E -e 's,^dat/content-dir/,,' -e 's,/.*/Public/, ,' -e 's,/metadata\.txt:,/,' -e 's,\s+, ,g' | sort -u > $@

dat/pools: $(download) dat/metadata.txt dat/index.txt
	rm -rf -- $@ $@.bak
	poolify dat/metadata.txt dat/index.txt || $(dirfail)

# Part 4: Turn each `dat/pools/snaps/*` directory into a Git commit.
#
# - `dat/git/`

dat/git: dat/pools $(addsuffix readme.txt,$(content-dir)) $(addsuffix metadata.txt,$(content-dir))
	rm -rf -- $@ $@.bak
	gitify $@ || $(dirfail)

################################################################################

endif
endif

.DELETE_ON_ERROR:
.SECONDARY:
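# Example usage (a sketch: it assumes the helper scripts in bin/,
# i.e. cdxget, urlkey2url, fmt-metadata, poolify, and gitify, are
# present; the PATH override at the top of this file makes them
# visible to the recipes):
#
#     make            # run all three stages, ending with dat/git
#     make download   # only fetch the raw Wayback Machine captures
#     git -C dat/git log    # browse the result, assuming gitify
#                           # leaves an ordinary Git repository there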