 import java.io.*;

/**
 * Fairly trivial wrapper for the StreamTokenizer class, to allow
 * you to scan for link elements from an HTML stream.
 * It's pretty simple to use: you open a new LinkScanner on an input stream or
 * reader and then keep asking for links (via nextLink()) until it returns
 * null to signal the end of the stream.
 *<P>
 *
 * @author  Julie Zelenski
 * @version 1.1 11/1/99
 */


public class LinkScanner {

	private StreamTokenizer parser = null;	// tokenizer over the underlying stream
	private boolean extractTags = true;	// if true, everything inside angle brackets is one token
	
	
	/** Constructs a new scanner from any InputStream (such as one 
	 * created by opening a file or a URLConnection).
	 * NOTE(review): bytes are decoded with the platform default charset,
	 * not the charset the HTML document declares — confirm acceptable.
	 */	
	public LinkScanner(InputStream in)
	{
		this(new InputStreamReader(in), true);
	}
	
	
	/** Constructs a new scanner from any Reader (new 1.1 I/O classes). 
	 */	
	public LinkScanner(Reader reader)
	{
		this(reader, true);
	}
	
	
	/** Constructs a new scanner from any Reader and allows you to specify
	 * whether you want to extract HTML tags as a unit. If true, it will 
	 * pull out everything within &lt;&gt; as one unit. Otherwise, it doesn't
	 * treat the angle brackets specially.
	 */	
	private LinkScanner(Reader reader, boolean extractHTMLTagsAsToken)
	{
		parser = new StreamTokenizer(reader);
		configureForNormalTokens();
		extractTags = extractHTMLTagsAsToken;
		if (!extractTags) parser.quoteChar('"');	// use double-quoting to protect whitespace inside tags
	}
	
	
	/**
	 * Retrieves the next link from the stream and returns it (as a String).
	 * Uses the tokenizer to pull tokens until an HTML tag is returned, then
	 * searches for a URL within it if possible. This method returns null if
	 * there are no more tokens, indicating the stream is at EOF.
	 * It raises an IOException if the underlying stream is unavailable or
	 * has some I/O trouble reading from the stream.
	 *
	 * @return      the next URL found, or null at end of stream
	 * @exception  IOException  if error reading from stream
	 */
	public String nextLink() throws IOException
	{
		if (parser == null) 
			throw new IOException("Attempt to scan from null stream or reader");
		while (true) {
			String token = nextToken(); 
			if (token == null) return null;	// EOF, no more links
			if (token.startsWith("<")) {	// is HTML tag
				String url = findURLinTag(token);
				if (url != null) return url;
			}
		}
	} 
	
	
   /**
	 * Retrieves the next token from the stream and returns it (as a String).
	 * All whitespace is skipped and ignored. Alphanumerics (and the few
	 * punctuation chars treated as word chars) are glommed together into a
	 * word. Other single punctuation chars are returned singly as a
	 * one-character string. This method returns null if there are no more
	 * tokens, indicating the stream is at EOF.
	 * It raises an IOException if the underlying stream is unavailable or
	 * has some I/O trouble reading from the stream.
	 *
	 * @exception  IOException  if error reading from stream
	 */
	private String nextToken() throws IOException
	{
		int ttype;
		
		if (parser == null) 
			throw new IOException("Attempt to scan from null stream or reader");
		switch (ttype = parser.nextToken()) {
			case StreamTokenizer.TT_EOF : return null;	// return null at EOF
			case StreamTokenizer.TT_WORD: return parser.sval;	// return word as string
			// quoting is only enabled when !extractTags (see constructor)
			case '"': if (extractTags) return "\""; else return parser.sval;	// return quoted string
			case '<': if (extractTags) return getHTMLTag();	// fall-thru if not extracting tags
			default : return String.valueOf((char)ttype);	// single "ordinary" char
		}
	} 


	/*
	 * This is used to configure the tokenizer for normal word extraction.
	 * We treat most characters as part of a word: alphanumerics, extended
	 * ASCII, etc. as well as a few punctuation chars (-, _, and '). The rest
	 * of the punctuation we leave as ordinary and thus it will be reported as
	 * single-character delimiters. We use no quote characters and 
	 * do not parse numbers.
	 */
	private void configureForNormalTokens()
	{
		parser.resetSyntax();	// clears prev settings, all chars ordinary
		parser.whitespaceChars(0, ' ');	// control, tab, newline, etc treated as whitespace
		parser.wordChars('a', 'z');
		parser.wordChars('A', 'Z');
		parser.wordChars('0', '9');	// treat digits like normal alpha chars
		parser.wordChars(129, 255);	// extended ASCII characters, too
		String extraWordChars = "_-'";	// a few punct chars that are also treated like alpha
		for (int i = 0; i < extraWordChars.length(); i++) {
			char ch = extraWordChars.charAt(i);
			parser.wordChars(ch, ch);	
		}		
	}
	

	/*
	 * This is used to temporarily adjust the parsing tables to have
	 * just a few delimiters. For example, one time this is used is right 
	 * after pulling the < from the stream and trying to extract the HTML  
	 * tag that is expected to follow it. In that case the only delimiter
	 * used is '>'. All other characters are set to word. This has
	 * the effect of making the parser pull all characters out until it
	 * gets to the closing >. The parser needs to be reset back to the normal 
	 * syntax tables (configureForNormalTokens) after such an operation.
	 */
	private void configureForSpecialDelimiters(String delimiters)
	{
		parser.resetSyntax();
		parser.wordChars(0, 255);	// make everything a word to start
		for (int i = 0; i < delimiters.length(); i++) 
			parser.ordinaryChar(delimiters.charAt(i));	// make only these ordinary
	}
	
	
	
	/*
	 * Called after finding < in the stream when we want to pull out the
	 * entire HTML tag as one token. We temporarily reset the syntax
	 * tables to only recognize > as a delimiter and then ask the parser
	 * to pull out the next token; we then reset the parser tables to normal
	 * and return the HTML tag (re-attaching the brackets on either end).
	 * If the < was the very last character in the stream, this returns
	 * null (EOF) rather than fabricating a bogus "<null>" token from
	 * string-concatenating the null reference.
	 */
	private String getHTMLTag() throws IOException
	{
		configureForSpecialDelimiters(">");
		String htmlTag = nextToken();
		nextToken();	// to pull off the trailing >; if ended at EOF, no big deal
		configureForNormalTokens();	// restores tables to normal
		if (htmlTag == null) return null;	// < was last char in stream; treat as EOF
		return "<" + htmlTag + ">";	// put the angle brackets back on
	}
	
	
	

	/*
	 * Scans ahead attempting to match a particular token (ignoring case).
	 * This is used to find href or src inside an HTML tag. The array
	 * allows you to look for more than one token. The allowSkip parameter
	 * indicates whether the token must be the very next token or whether
	 * intervening tokens may be thrown away.
	 */
	private static boolean scanTilMatch(LinkScanner scanner, String[] searchFor, boolean allowSkip) throws IOException
	{
		do {
			 String next = scanner.nextToken();
			 if (next == null) return false;	// hit EOF without a match
			 for (int i = 0; i < searchFor.length; i++)
				 if (searchFor[i].equalsIgnoreCase(next)) return true;
		} while (allowSkip);
		return false;	// one token tried, no match, skipping not allowed
	}
	

	/**
	 * Static utility method to extract a URL from an HTML anchor, frame src, or
	 * image map tag. You can give this method any HTML tag (so any string enclosed
	 * in &lt;&gt; brackets) and it will attempt to find the proper sequence of
	 * "&lt;... HREF=" or "&lt;... SRC=" and then pull out the URL that follows. For
	 * a reasonably formatted anchor/frame/img tag, this method will return the
	 * URL referenced in it. For any other type of tag, malformed tag, not
	 * a tag at all, etc. it will return null. It may be that this doesn't
	 * find the URL in all the wacky variations of ref tags you might find;
	 * don't sweat it, this version catches most and that is all we will expect.
	 *
     * @param  htmlTag  string containing an HTML tag wrapped in &lt;&gt;
     * @return  the extracted URL (if there was one), null otherwise
	 */
	private static String findURLinTag(String htmlTag)
	{		
		if (!htmlTag.startsWith("<") || !htmlTag.endsWith(">")) return null;	// not a proper tag
		
		try {
			// re-scan the tag text itself, without tag extraction, so the
			// individual attribute names and '=' come back as separate tokens
			LinkScanner scanner = new LinkScanner(new StringReader(htmlTag), false);
			String[] linkTags = {"HREF", "SRC"};	// look for one of HREF or SRC
			String[] equals = {"="};	// must be followed by equals
			if (!scanTilMatch(scanner, linkTags, true)) return null;
			if (!scanTilMatch(scanner, equals, false)) return null;	// must be followed by = token
			
			// If we got this far, we have href= or src= and now need to pull out the
			// URL that follows it
			scanner.configureForSpecialDelimiters(">");	// accept all characters in URL
			scanner.parser.whitespaceChars(0, 32);	// whitespace can delimit, too
			scanner.parser.quoteChar('"');	// turn back on double-quoting to protect whitespace
			String urlString = scanner.nextToken();
				// some weird links start with '?' (Apache file listings); dump those
			if (urlString == null || urlString.startsWith("?")) return null;
			
			int refBegin = urlString.indexOf('#');	// truncate off optional reference tag
			if (refBegin != -1) urlString = urlString.substring(0, refBegin);
			return urlString;
		}
		catch (IOException e) { 
			return null;	// never happens, there won't be I/O trouble scanning a string
		}
	}

}
