#!/bin/sh
#
# urlheads4feed2csv - list RSS/Atom feeds in ./urlheadinfo -j output as CSV
#
# $Id$
#
# Usage: urlheads4feed2csv [file ...]
#
# Reads JSON records from the named files (or from stdin when no file is
# given), keeps those whose "Status" contains "200" and whose "Content-Type"
# contains "xml", downloads each such URL and prints one line per URL:
#
#     type,title,url
#
# where type is "atom", "rss" or "unknown".
#
# Requires: jq, curl, xmlstarlet, perl.

# we want to filter by MIME types application/rss+xml and application/atom+xml
# except that a quick test on my Firefox history URLs with
#
# jq -r '. | select(contains({"Content-Type":"xml"})) | [.URL, .["Content-Type"]] | @csv' all.json | egrep -iv 'rss|atom'
#
# displays feed URLs with application/xml, text/xml; charset=UTF-8, etc.
# so I'll retrieve all documents whose MIME type contains 'xml' and
# find an element that characterizes RSS or Atom

cat "$@" |
jq -r '. | select(contains({"Status":"200","Content-Type":"xml"})) | .URL' |
while IFS= read -r url    # -r / IFS=: take the URL verbatim (no backslash or blank munging)
do
    # Classify the document: an Atom-namespaced <feed> root means Atom, any
    # element named "channel" means RSS, anything else is "unknown".  The
    # type is followed by a single space and the first <title> found.
    # normalize-space() collapses newlines and runs of blanks inside the
    # title so that it cannot spill over several output lines.
    curl -s "$url" |
    xmlstarlet \
        sel -N a=http://www.w3.org/2005/Atom -t \
            --if '/a:feed' -o atom \
            --elif '//*[local-name()="channel"]' -o rss \
            --else -o unknown \
        -t -o ' ' \
        -t -v 'normalize-space((//*[local-name()="title"])[1])' |
    # Drop every comma from the title (/g: not just the first one) so it
    # cannot be mistaken for a field separator, then turn the first space,
    # i.e. the one emitted between type and title, into the separator.
    perl -pe 's#,##g; s# #,#'

    # Last field: the URL itself.  The original relies on this landing on the
    # same line as the xmlstarlet output above; that is kept unchanged.
    # NOTE(review): a URL containing a comma still yields an extra field.
    printf ',%s\n' "$url"
done