import java.lang.*;
import java.util.*;
import java.io.*;
import java.net.*;

/**
 * Command-line tool that sends a query to a web search engine, downloads the
 * result page and prints the host part of each URL it can find in the HTML.
 *
 * <p>Usage: {@code java TEST {-y/-g/-m} [-ss/-p/-d] query terms...}
 *
 * <p>The extraction methods work by plain substring matching against marker
 * strings; they do not parse HTML.
 */
public class TEST {

    /** Prefixes that {@link #formatUrl(String)} strips from the front of its input. */
    private static final String[] URL_PREFIXES = { "http://", "https://", " - " };

    /**
     * Program entry point: parses the command line and runs the search.
     *
     * <p>The first argument selects the engine ({@code -y}, {@code -g} or
     * {@code -m}). When more than two arguments are given, the second may be
     * a mode flag ({@code -ss}, {@code -p} or {@code -d}). All remaining
     * arguments are joined with single spaces (a trailing space included) to
     * form the query. Prints the usage text and exits when no argument or an
     * unknown engine flag is given.
     *
     * @param args command-line arguments
     */
    static public void main(String args[]) {
        int argc = args.length;
        long start = System.currentTimeMillis();

        if (argc < 1) {
            printUsage();
            return;
        }

        // decide whose search engine we are hitting
        boolean yahoo = false;
        boolean google = false;
        boolean msoft = false;
        String engine = args[0];
        if (engine.equals("-y")) {
            yahoo = true;
        } else if (engine.equals("-g")) {
            google = true;
        } else if (engine.equals("-m")) {
            msoft = true;
        } else {
            printUsage();
            return;
        }
        int qstart = 1;

        // decide sponsored search, performance only, debugging, or default to search
        boolean ss = false;
        boolean p = false;
        boolean d = false;
        if (argc > 2) {
            String mode = args[1];
            if (mode.equals("-ss")) {
                ss = true;
                qstart++;
            } else if (mode.equals("-p")) {
                p = true;
                qstart++;
            } else if (mode.equals("-d")) {
                d = true;
                qstart++;
            }
        }

        // everything after the flags is the query; any number of terms is accepted
        StringBuffer sb = new StringBuffer();
        for (int i = qstart; i < argc; i++) {
            sb.append(args[i]);
            sb.append(" ");
        }

        execute(sb.toString(), yahoo, google, msoft, ss, p, d, start);
    }

    /**
     * Prints the usage line to standard output and terminates the JVM with
     * exit status -1.
     */
    static void printUsage() {
        System.out.println("Usage: java TEST {-y/-g/-m} [-ss/-p/-d] query terms...");
        // -y --> Yahoo
        // -g --> Google
        // -m --> Microsoft
        // default is raw search (not Sponsored Search)
        // -ss --> Sponsored Search (Optional)
        // -p --> performance only (no parse, just get the raw HTML, optional)
        // -d --> debugging, print out UNADULTERATED HTML returned from search
        System.exit(-1);
    }

    /**
     * Fetches the result page for {@code query} from the selected engine and
     * either reports timing, dumps the raw page, or prints the extracted URLs.
     *
     * <p>Engine precedence is yahoo, then google, then msoft; when none is set
     * the Yahoo URL is fetched, but the extraction step then exits with
     * status -1. In {@code p} and {@code d} mode the JVM exits with status 0
     * after printing. Failures are reported on standard error instead of
     * being thrown.
     *
     * @param query  search terms (URL-encoded here before being sent)
     * @param yahoo  query Yahoo
     * @param google query Google
     * @param msoft  query Microsoft
     * @param ss     print sponsored results instead of normal results
     * @param p      performance only: print elapsed milliseconds and byte count
     * @param d      debugging: print the page exactly as decoded
     * @param start  start time in milliseconds, as from {@code System.currentTimeMillis()}
     */
    static public void execute(String query, boolean yahoo, boolean google,
                               boolean msoft, boolean ss, boolean p, boolean d,
                               long start) {
        final String ysearch = "http://search.yahoo.com/search?p=";
        final String gsearch = "http://www.google.com/search?q=";
        final String msearch = "http://search.live.com/results.aspx?q=";

        String search;
        if (yahoo) {
            search = ysearch;
        } else if (google) {
            search = gsearch;
        } else if (msoft) {
            search = msearch;
        } else {
            search = ysearch;
        }

        try {
            URL url = new URL(search + URLEncoder.encode(query, "UTF-8"));
            URLConnection h = url.openConnection();
            h.setRequestProperty("User-Agent",
                    "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT)");
            h.connect();

            // Collect raw bytes first and decode once, so that a multi-byte
            // character split across two reads is not corrupted.
            ByteArrayOutputStream body = new ByteArrayOutputStream();
            InputStream inp = null;
            try {
                inp = h.getInputStream();
                byte[] buffer = new byte[1024 * 50];
                int length;
                while ((length = inp.read(buffer)) != -1) {
                    body.write(buffer, 0, length);
                }
            } catch (IOException e) {
                // best effort: carry on with whatever was read so far
                System.err.println("Error reading response: " + e);
            } finally {
                if (inp != null) {
                    try {
                        inp.close();
                    } catch (IOException ignored) {
                        // nothing useful can be done if close fails
                    }
                }
            }

            int count = body.size();
            // NOTE(review): decoded as UTF-8; the response's declared charset
            // is not consulted.
            String page = body.toString("UTF-8");

            if (p) {
                // Performance only; don't parse the results
                System.err.println("Performance/MSECS: "
                        + (System.currentTimeMillis() - start)
                        + " Bytes: " + count);
                System.exit(0);
            } else if (d) {
                // Debugging only; unadulterated HTML
                System.out.println("UNADULTERATED: " + page);
                System.exit(0);
            } else if (ss) {
                // going for the sponsored search results
                System.out.println("Sponsored Search results for " + query);
                if (yahoo) {
                    extractURLSyahooss(page);
                } else if (google) {
                    extractURLSgoogless(page);
                } else if (msoft) {
                    extractURLSmsoftss(page);
                } else {
                    System.exit(-1);
                }
            } else {
                // default is normal web search
                System.out.println("Search results for " + query);
                if (yahoo) {
                    extractURLSyahoo(page);
                } else if (google) {
                    extractURLSgoogle(page);
                } else if (msoft) {
                    extractURLSmsoft(page);
                } else {
                    System.exit(-1);
                }
            }
        } catch (Exception e) {
            // report instead of silently discarding the failure
            System.err.println("Search failed: " + e);
        }
    }

    /**
     * Prints one formatted URL per match found in a Yahoo result page.
     *
     * <p>For each occurrence of the marker, the text starting 16 characters
     * after the marker position and running to the next space is passed to
     * {@link #formatUrl(String)} and printed. Scanning stops when no further
     * space is found.
     *
     * @param results raw HTML of the result page
     */
    static void extractURLSyahoo(String results) {
        // NOTE(review): the marker literal is empty in this source while its
        // declared length is 16 -- presumably the original text was lost;
        // confirm the intended value.
        final String YTAG = "";
        final int YTAGLEN = 16;
        final char ENDCHAR = ' '; // character that ends the URL

        // use lowercase version of the page for string matching
        String lower = results.toLowerCase(Locale.ENGLISH);

        int ytagStart = lower.indexOf(YTAG);
        while (ytagStart != -1) {
            int urlStart = ytagStart + YTAGLEN;
            int urlEnd = lower.indexOf(ENDCHAR, urlStart);
            if (urlEnd == -1) {
                break; // no terminator: nothing more can be extracted
            }
            System.out.println(formatUrl(results.substring(urlStart, urlEnd)));
            // now look for the next search result
            ytagStart = lower.indexOf(YTAG, urlEnd);
        }
    } /* end extract URLS for Yahoo Search */

    /**
     * Prints one formatted URL per sponsored entry found in a Yahoo result page.
     *
     * <p>Each entry is located by the string {@code "overture"}; the URL is
     * the text between the start delimiter and the end delimiter that follow
     * it. Scanning stops as soon as either delimiter cannot be found.
     *
     * @param results raw HTML of the result page
     */
    static void extractURLSyahooss(String results) {
        final String YTAG1 = "overture";
        final int YTAGLEN1 = 8;
        // NOTE(review): both delimiters are empty in this source although
        // their declared lengths are 4 and 5 -- presumably the original text
        // was lost; confirm the intended values.
        final String YTAG2 = "";
        final int YTAGLEN2 = 4;
        final String YTAG3 = "";

        // use lowercase version of the page for string matching
        String lower = results.toLowerCase(Locale.ENGLISH);

        int start = lower.indexOf(YTAG1);
        while (start != -1) {
            // continue past the overture string
            start = start + YTAGLEN1;

            int delimiter = lower.indexOf(YTAG2, start);
            if (delimiter == -1) {
                break;
            }
            int urlStart = delimiter + YTAGLEN2;

            // the end delimiter must not precede the URL start
            int urlEnd = lower.indexOf(YTAG3, urlStart);
            if (urlEnd < urlStart) {
                break; // not found, or start lies beyond the end of the page
            }

            System.out.println(formatUrl(results.substring(urlStart, urlEnd)));

            // now look for the next search result
            start = lower.indexOf(YTAG1, urlEnd);
        }
    } /* end extract URLS for Yahoo Sponsored Search */

    /**
     * Prints one formatted URL per external link found in a Google result page.
     *
     * <p>Every {@code href=} attribute is examined. The value may be enclosed
     * in {@code "}, {@code '} or {@code `}; an unquoted value ends at
     * {@code >}. Site-relative links (starting with {@code /}) and links
     * rejected by {@link #isGoogleResultLink(String)} are skipped.
     *
     * @param results raw HTML of the result page
     */
    static void extractURLSgoogle(String results) {
        final String HREFTAG = "href=";
        final int HREFLEN = 5;

        // use a lowercase version of the web page when doing string matching
        String lower = results.toLowerCase(Locale.ENGLISH);
        int pageLength = lower.length();

        int hrefStart = lower.indexOf(HREFTAG);
        while (hrefStart != -1) {
            // skip over blanks after the HREFTAG
            int i = hrefStart + HREFLEN;
            while (i < pageLength && lower.charAt(i) == ' ') {
                i++;
            }
            if (i >= pageLength) {
                break; // attribute has no value before the page ends
            }

            // a quoted value ends at the matching quote, otherwise at '>'
            char endHrefChar = '>';
            char opener = lower.charAt(i);
            if (opener == '"' || opener == '\'' || opener == '`') {
                endHrefChar = opener;
                i++; // skip over the quote
            }

            int urlStart = i;
            int urlEnd = lower.indexOf(endHrefChar, urlStart + 1);
            if (urlEnd == -1) {
                break; // unterminated value: nothing more can be extracted
            }

            // urlEnd > urlStart here, so urlStart is a valid index
            if (lower.charAt(urlStart) != '/') {
                // not a site relative link
                String href = results.substring(urlStart, urlEnd);
                if (isGoogleResultLink(href)) {
                    System.out.println(formatUrl(href));
                }
            }

            // now look for the next search result
            hrefStart = lower.indexOf(HREFTAG, urlEnd);
        }
    } /* end extract URLS for Google Search */

    /**
     * Decides whether an href value from a Google page is an external search
     * result.
     *
     * @param href non-empty href value
     * @return {@code false} for links containing {@code google.com} or
     *         {@code youtube.com}, values too short to hold a scheme plus a
     *         host, values whose first character after the scheme is a digit
     *         (IP address), and values starting with {@code 'a'}
     *         (e.g. {@code a.href.replace}); {@code true} otherwise
     */
    private static boolean isGoogleResultLink(String href) {
        if (href.indexOf("google.com") != -1) {
            return false; // google link
        }
        if (href.indexOf("youtube.com") != -1) {
            return false; // youtube link
        }
        int hostIndex = href.startsWith("https://") ? 8 : 7;
        if (href.length() <= hostIndex) {
            return false; // too short to be scheme + host
        }
        char hostFirst = href.charAt(hostIndex);
        if (hostFirst >= '0' && hostFirst <= '9') {
            return false; // URL has IP address
        }
        if (href.charAt(0) == 'a') {
            return false; // a.href.replace
        }
        return true;
    }

    /**
     * Prints one formatted URL per sponsored entry found in a Google result page.
     *
     * <p>Each entry is located by the string {@code "id=an"}; the URL is the
     * text starting 6 characters after the following start marker and running
     * to the end marker. Scanning stops when a marker cannot be found.
     *
     * @param results raw HTML of the result page
     */
    static void extractURLSgoogless(String results) {
        final String HREFTAG1 = "id=an";
        // NOTE(review): both markers are empty in this source although the
        // start marker's declared length is 6 -- presumably the original
        // text was lost; confirm the intended values.
        final String HREFTAG2 = "";
        final String HREFTAG3 = "";
        final int HREFTAG2LEN = 6;

        // use a lowercase version of the web page when doing string matching
        String lower = results.toLowerCase(Locale.ENGLISH);

        // Two searches per result: one for the entry marker, one for the
        // start of the actual URL.
        int hrefStart = lower.indexOf(HREFTAG1);
        while (hrefStart != -1) {
            hrefStart = lower.indexOf(HREFTAG2, hrefStart);
            if (hrefStart == -1) {
                break;
            }
            int urlStart = hrefStart + HREFTAG2LEN;
            int urlEnd = lower.indexOf(HREFTAG3, urlStart);
            if (urlEnd < urlStart) {
                break; // not found, or start lies beyond the end of the page
            }
            System.out.println(formatUrl(results.substring(urlStart, urlEnd)));

            // now look for the next search result
            hrefStart = lower.indexOf(HREFTAG1, urlEnd);
        }
    } /* end extract URLS for Google Sponsored Search */

    /**
     * Prints one formatted URL per link found in the results section of a
     * Microsoft result page.
     *
     * <p>Links are {@code href="..."} attributes located from the
     * {@code div id="results"} marker up to the {@code div class="sb_pag"}
     * marker. When the start marker is absent scanning begins at the top of
     * the page; when the end marker is absent nothing is printed. Links
     * containing {@code msnscache} or {@code r.msn.com} are skipped.
     *
     * @param results raw HTML of the result page
     */
    static void extractURLSmsoft(String results) {
        final String START = "div id=\"results\"";
        final String END = "div class=\"sb_pag\"";
        final String SKIP1 = "msnscache";
        final String SKIP2 = "r.msn.com";
        final String HREFTAG = "href=\"";
        final int HREFLEN = 6;
        final char endHrefChar = '"'; // character marking end of an HREF

        // use a lowercase version of the web page when doing string matching
        String lower = results.toLowerCase(Locale.ENGLISH);

        int searchStart = lower.indexOf(START);
        int searchEnd = lower.indexOf(END);

        // find the first href on the page (after START, if present)
        int hrefStart = lower.indexOf(HREFTAG, Math.max(searchStart, 0));

        // plow thru all search results that begin before the END marker
        while (hrefStart != -1 && hrefStart <= searchEnd) {
            int urlStart = hrefStart + HREFLEN;
            int urlEnd = lower.indexOf(endHrefChar, urlStart);
            if (urlEnd == -1) {
                break; // unterminated attribute value
            }

            // determine whether it is a search result or a msn link
            String href = results.substring(urlStart, urlEnd);
            if (href.indexOf(SKIP1) == -1 && href.indexOf(SKIP2) == -1) {
                System.out.println(formatUrl(href));
            }

            // now look for the next search result
            hrefStart = lower.indexOf(HREFTAG, urlEnd);
        }
    } /* end extract URLS for Microsoft Search */

    /**
     * Prints one formatted URL per sponsored link found in a Microsoft result
     * page.
     *
     * <p>The normal results lie between the {@code div id="results"} and
     * {@code div class="sb_pag"} markers; only links lying entirely before or
     * entirely after that range are printed. Links containing
     * {@code "Encarta Encyclopedia"} or {@code r.msn.com} are skipped.
     * Nothing is printed when either section marker is absent.
     *
     * @param results raw HTML of the result page
     */
    static void extractURLSmsoftss(String results) {
        final String START = "div id=\"results\"";
        final String END = "div class=\"sb_pag\"";
        // NOTE(review): both markers are empty in this source although the
        // link marker's declared length is 6 -- presumably the original text
        // was lost; confirm the intended values.
        final String HREFTAG = "";
        final String ENDREFSTRING = "";
        final String SKIP1 = "Encarta Encyclopedia";
        final String SKIP2 = "r.msn.com";
        final int HREFLEN = 6;

        // use a lowercase version of the web page when doing string matching
        String lower = results.toLowerCase(Locale.ENGLISH);

        // find start, end of real results
        int searchStart = lower.indexOf(START);
        int searchEnd = lower.indexOf(END);
        if (searchStart == -1 || searchEnd == -1) {
            return;
        }

        // The sponsored results can be in multiple sections (top, right
        // rail, bottom), so every link outside the normal range is reported.
        int hrefStart = lower.indexOf(HREFTAG);
        while (hrefStart != -1) {
            int urlStart = hrefStart + HREFLEN;
            int urlEnd = lower.indexOf(ENDREFSTRING, urlStart);
            if (urlEnd < urlStart) {
                break; // not found, or start lies beyond the end of the page
            }

            boolean beforeResults = urlStart < searchStart && urlEnd < searchStart;
            boolean afterResults = urlStart > searchEnd && urlEnd > searchEnd;
            if (beforeResults || afterResults) {
                // sponsored result (not a Web result)
                String href = results.substring(urlStart, urlEnd);
                if (href.indexOf(SKIP1) == -1 && href.indexOf(SKIP2) == -1) {
                    System.out.println(formatUrl(href));
                }
            }

            // now look for the next sponsored search result
            hrefStart = lower.indexOf(HREFTAG, urlEnd);
        }
    } /* end extract URLS for Microsoft Sponsored Search */

    /**
     * Reduces an extracted URL fragment to its leading host-like part.
     *
     * <p>Leading {@code http://}, {@code https://} and {@code " - "} prefixes
     * are removed. Characters are then copied until the first {@code /}, with
     * up to two embedded HTML tags skipped along the way; a third {@code <}
     * ends the result. The returned string contains exactly the copied
     * characters.
     *
     * @param ytag URL fragment, possibly containing markup
     * @return the cleaned-up fragment; empty when nothing could be copied
     */
    static public String formatUrl(String ytag) {
        int length = ytag.length();
        StringBuffer format = new StringBuffer(length);
        int i = 0;

        // strip known prefixes from the front, in whatever order they occur
        boolean skipped = true;
        while (skipped) {
            skipped = false;
            for (int k = 0; k < URL_PREFIXES.length; k++) {
                if (ytag.startsWith(URL_PREFIXES[k], i)) {
                    i = i + URL_PREFIXES[k].length();
                    skipped = true;
                }
            }
        }

        // remove embedded HTML that bolds the result and end at trailing char
        boolean first = true;
        boolean second = true;
        boolean done = false;
        while ((i < length) && (!done)) {
            char c = ytag.charAt(i);
            if (c == '<') {
                if (first) {
                    // skips exactly three characters; assumes the first tag
                    // is of the form <b> -- TODO confirm
                    i = i + 3;
                    first = false;
                } else if (second) {
                    // skip the second tag up to and including its '>'
                    int close = ytag.indexOf('>', i + 1);
                    if (close == -1) {
                        done = true; // unterminated tag: stop here
                    } else {
                        i = close + 1;
                        second = false;
                    }
                } else {
                    done = true; // a third tag ends the URL text
                }
            } else if (c == '/') {
                done = true; // trailing slash
            } else {
                /* add character to result */
                format.append(c);
                i++;
            }
        }
        return format.toString();
    }
} /* end class TEST */