 import java.io.*;

/**
 * Fairly trivial wrapper for the StreamTokenizer class, to allow
 * you to scan for link elements from an HTML stream.
 * It's pretty simple to use: you open a new LinkScanner on an input stream or
 * reader and then keep asking for links (via nextLink()) until it returns
 * null to signal the end of the stream.
 *<P>
 *
 * @author  Julie Zelenski
 * @version 1.1 11/1/99
 */


public class LinkScanner {

	private StreamTokenizer parser = null;	// tokenizer over the underlying stream
	private boolean extractTags = true;	// if true, everything inside angle brackets is one token
	
	
	/** Constructs a new scanner from any InputStream (such as one 
	 * created by opening a file or a URLConnection).
	 * NOTE(review): bytes are decoded with the platform default charset,
	 * not the charset the HTML document declares — confirm acceptable.
	 */	
	public LinkScanner(InputStream in)
	{
		this(new InputStreamReader(in), true);
	}
	
	
	/** Constructs a new scanner from any Reader (new 1.1 I/O classes). 
	 */	
	public LinkScanner(Reader reader)
	{
		this(reader, true);
	}
	
	
	/** Constructs a new scanner from any Reader and allows you to specify
	 * whether you want to extract HTML tags as a unit. If true, it will 
	 * pull out everything within &lt;&gt; as one unit. Otherwise, it doesn't
	 * treat the angle brackets specially.
	 */	
	private LinkScanner(Reader reader, boolean extractHTMLTagsAsToken)
	{
		parser = new StreamTokenizer(reader);
		configureForNormalTokens();
		extractTags = extractHTMLTagsAsToken;
		if (!extractTags) parser.quoteChar('"');	// use double-quoting to protect whitespace inside tags
	}
	
	
	/**
	 * Retrieves the next link from the stream and returns it (as a String).
	 * Uses the tokenizer to pull tokens until an HTML tag is returned, then
	 * searches for a URL within it if possible. This method returns null if
	 * there are no more tokens, indicating the stream is at EOF.
	 * It raises an IOException if the underlying stream is unavailable or
	 * has some I/O trouble reading from the stream.
	 *
	 * @return      the next URL found, or null at end of stream
	 * @exception  IOException  if error reading from stream
	 */
	public String nextLink() throws IOException
	{
		if (parser == null) 
			throw new IOException("Attempt to scan from null stream or reader");
		while (true) {
			String token = nextToken(); 
			if (token == null) return null;	// EOF, no more links
			if (token.startsWith("<")) {	// is HTML tag
				String url = findURLinTag(token);
				if (url != null) return url;
			}
		}
	} 
	
	
   /**
	 * Retrieves the next token from the stream and returns it (as a String).
	 * All whitespace is skipped and ignored. Alphanumerics (and the few
	 * punctuation chars treated as word chars) are glommed together into a
	 * word. Other single punctuation chars are returned singly as a
	 * one-character string. This method returns null if there are no more
	 * tokens, indicating the stream is at EOF.
	 * It raises an IOException if the underlying stream is unavailable or
	 * has some I/O trouble reading from the stream.
	 *
	 * @exception  IOException  if error reading from stream
	 */
	private String nextToken() throws IOException
	{
		int ttype;
		
		if (parser == null) 
			throw new IOException("Attempt to scan from null stream or reader");
		switch (ttype = parser.nextToken()) {
			case StreamTokenizer.TT_EOF : return null;	// return null at EOF
			case StreamTokenizer.TT_WORD: return parser.sval;	// return word as string
			// quoting is only enabled when !extractTags (see constructor)
			case '"': if (extractTags) return "\""; else return parser.sval;	// return quoted string
			case '<': if (extractTags) return getHTMLTag();	// fall-thru if not extracting tags
			default : return String.valueOf((char)ttype);	// single "ordinary" char
		}
	} 


	/*
	 * This is used to configure the tokenizer for normal word extraction.
	 * We treat most characters as part of a word: alphanumerics, extended
	 * ASCII, etc. as well as a few punctuation chars (-, _, and '). The rest
	 * of the punctuation we leave as ordinary and thus it will be reported as
	 * single-character delimiters. We use no quote characters and 
	 * do not parse numbers.
	 */
	private void configureForNormalTokens()
	{
		parser.resetSyntax();	// clears prev settings, all chars ordinary
		parser.whitespaceChars(0, ' ');	// control, tab, newline, etc treated as whitespace
		parser.wordChars('a', 'z');
		parser.wordChars('A', 'Z');
		parser.wordChars('0', '9');	// treat digits like normal alpha chars
		parser.wordChars(129, 255);	// extended ASCII characters, too
		String extraWordChars = "_-'";	// a few punct chars that are also treated like alpha
		for (int i = 0; i < extraWordChars.length(); i++) {
			char ch = extraWordChars.charAt(i);
			parser.wordChars(ch, ch);	
		}		
	}
	

	/*
	 * This is used to temporarily adjust the parsing tables to have
	 * just a few delimiters. For example, one time this is used is right 
	 * after pulling the < from the stream and trying to extract the HTML  
	 * tag that is expected to follow it. In that case the only delimiter
	 * used is '>'. All other characters are set to word. This has
	 * the effect of making the parser pull all characters out until it
	 * gets to the closing >. The parser needs to be reset back to the normal 
	 * syntax tables (configureForNormalTokens) after such an operation.
	 */
	private void configureForSpecialDelimiters(String delimiters)
	{
		parser.resetSyntax();
		parser.wordChars(0, 255);	// make everything a word to start
		for (int i = 0; i < delimiters.length(); i++) 
			parser.ordinaryChar(delimiters.charAt(i));	// make only these ordinary
	}
	
	
	
	/*
	 * Called after finding < in the stream when we want to pull out the
	 * entire HTML tag as one token. We temporarily reset the syntax
	 * tables to only recognize > as a delimiter and then ask the parser
	 * to pull out the next token; we then reset the parser tables to normal
	 * and return the HTML tag (re-attaching the brackets on either end).
	 * If the < was the very last character in the stream, this returns
	 * null (EOF) rather than fabricating a bogus "<null>" token from
	 * string-concatenating the null reference.
	 */
	private String getHTMLTag() throws IOException
	{
		configureForSpecialDelimiters(">");
		String htmlTag = nextToken();
		nextToken();	// to pull off the trailing >; if ended at EOF, no big deal
		configureForNormalTokens();	// restores tables to normal
		if (htmlTag == null) return null;	// < was last char in stream; treat as EOF
		return "<" + htmlTag + ">";	// put the angle brackets back on
	}
	
	
	

	/*
	 * Scans ahead attempting to match a particular token (ignoring case).
	 * This is used to find href or src inside an HTML tag. The array
	 * allows you to look for more than one token. The allowSkip parameter
	 * indicates whether the token must be the very next token or whether
	 * intervening tokens may be thrown away.
	 */
	private static boolean scanTilMatch(LinkScanner scanner, String[] searchFor, boolean allowSkip) throws IOException
	{
		do {
			 String next = scanner.nextToken();
			 if (next == null) return false;	// hit EOF without a match
			 for (int i = 0; i < searchFor.length; i++)
				 if (searchFor[i].equalsIgnoreCase(next)) return true;
		} while (allowSkip);
		return false;	// one token tried, no match, skipping not allowed
	}
	

	/**
	 * Static utility method to extract a URL from an HTML anchor, frame src, or
	 * image map tag. You can give this method any HTML tag (so any string enclosed
	 * in &lt;&gt; brackets) and it will attempt to find the proper sequence of
	 * "&lt;... HREF=" or "&lt;... SRC=" and then pull out the URL that follows. For
	 * a reasonably formatted anchor/frame/img tag, this method will return the
	 * URL referenced in it. For any other type of tag, malformed tag, not
	 * a tag at all, etc. it will return null. It may be that this doesn't
	 * find the URL in all the wacky variations of ref tags you might find;
	 * don't sweat it, this version catches most and that is all we will expect.
	 *
     * @param  htmlTag  string containing an HTML tag wrapped in &lt;&gt;
     * @return  the extracted URL (if there was one), null otherwise
	 */
	private static String findURLinTag(String htmlTag)
	{		
		if (!htmlTag.startsWith("<") || !htmlTag.endsWith(">")) return null;	// not a proper tag
		
		try {
			// re-scan the tag text itself, without tag extraction, so the
			// individual attribute names and '=' come back as separate tokens
			LinkScanner scanner = new LinkScanner(new StringReader(htmlTag), false);
			String[] linkTags = {"HREF", "SRC"};	// look for one of HREF or SRC
			String[] equals = {"="};	// must be followed by equals
			if (!scanTilMatch(scanner, linkTags, true)) return null;
			if (!scanTilMatch(scanner, equals, false)) return null;	// must be followed by = token
			
			// If we got this far, we have href= or src= and now need to pull out the
			// URL that follows it
			scanner.configureForSpecialDelimiters(">");	// accept all characters in URL
			scanner.parser.whitespaceChars(0, 32);	// whitespace can delimit, too
			scanner.parser.quoteChar('"');	// turn back on double-quoting to protect whitespace
			String urlString = scanner.nextToken();
				// some weird links start with '?' (Apache file listings); dump those
			if (urlString == null || urlString.startsWith("?")) return null;
			
			int refBegin = urlString.indexOf('#');	// truncate off optional reference tag
			if (refBegin != -1) urlString = urlString.substring(0, refBegin);
			return urlString;
		}
		catch (IOException e) { 
			return null;	// never happens, there won't be I/O trouble scanning a string
		}
	}

}
