We are getting great value from the Zoom Search Engine, which has allowed us to open almost 150 years of local newspapers for historical research. Some limitations of our documents and scanning process pointed us toward using the CGI/XML option, and I was not initially sure how best to proceed. In hopes that it may save time for someone else, here's a sample implementation of Zoom Search within a C# application.
(Note that we are storing search results in a Session variable for subsequent re-sorts and pagination. Our highly detailed index provides search results on about 7,000 large documents in roughly 5 seconds. Re-sorting results and moving between pages of search results -- because we store the XML returned by Zoom -- takes less than one second, which provides both the accuracy and performance we were hoping for.)
--------------------------------------
---------------
This could all be done better, but hope it helps!
(Note that we are storing search results in a Session variable for subsequent re-sorts and pagination. Our highly detailed index provides search results on about 7,000 large documents in roughly 5 seconds. Re-sorting results and moving between pages of search results -- because we store the XML returned by Zoom -- takes less than one second, which provides both the accuracy and performance we were hoping for.)
--------------------------------------
Code:
//create a class in which to store search result items
/// <summary>
/// One hit returned by the Zoom Search Engine, together with the display-ready
/// values that <c>Refresh</c> derives from it. Property names are kept as-is
/// because they are used as sort expressions and grid bindings.
/// </summary>
public class SearchResult
{
    // Not populated by Refresh.
    public string publicationName { get; set; }

    // Text of the item's <title> element.
    public string title { get; set; }

    // Starts as the item's <link> URL; Refresh rewrites it into an HTML anchor.
    public string link { get; set; }

    // Text of the item's <zoom:filetype> element.
    public string filetype { get; set; }

    // Starts as the text of <zoom:context>; Refresh rewrites it into HTML with
    // the matched terms wrapped in a "search_highlight" span.
    public string context { get; set; }

    // Value of <zoom:termsMatched>.
    public Int32 termsMatched { get; set; }

    // Value of <zoom:score>.
    public Int32 score { get; set; }

    // Value of <zoom:fileSize> (trailing 'k' removed), rescaled by Refresh for display.
    public double fileSize { get; set; }

    // The <zoom:highlight> children of <zoom:context>.
    public List<XElement> highlightedTerms { get; set; }

    // Not populated by Refresh.
    public string month { get; set; }

    // Not populated by Refresh.
    public string year { get; set; }

    // Not populated by Refresh.
    public string day { get; set; }

    // Publication date parsed from the linked file's name; DateTime.MinValue when it cannot be parsed.
    public DateTime dateOfPublication { get; set; }

    // Short-date rendering of dateOfPublication; empty when the date cannot be parsed.
    public string strDateOfPublication { get; set; }

    // Text of the item's <pubDate> element.
    public string pubDate { get; set; }
}

/// <summary>
/// Method to refresh the results, either by conducting a search
/// or by re-sorting of currently displayed results.
/// The XML returned by the Zoom CGI is kept in Session, keyed by the query string,
/// so that re-sorts and page changes do not repeat the search.
/// </summary>
/// <param name="sortExpression">Name of the SearchResult property to sort by; null or empty for no sorting.</param>
/// <param name="sortDirection">Direction keyword appended to the sort expression (passed through to the dynamic OrderBy).</param>
public void Refresh(string sortExpression, string sortDirection)
{
    //create variable to pass instructions to the Zoom Search Engine
    string queryString = string.Empty;

    if (!IsPostBack)
    {
        //on first call (e.g., from a stored url which contains search criteria), get the query string
        int posParams = Request.RawUrl.IndexOf("?");
        if (posParams >= 0)
        {
            queryString = Request.RawUrl.Substring(posParams + 1);
        }
    }
    else
    {
        //on post-backs from the search page, populate search request from variables provided.
        //The search text is URL-encoded so that characters such as '&', '=', '+' or '#'
        //cannot break or alter the query string handed to the CGI.
        //NOTE(review): assumes SearchString holds the raw (not already encoded) user text - confirm.
        queryString = "zoom_query=" + Server.UrlEncode(SearchString);
        queryString += "&zoom_and=" + Convert.ToInt16(MatchModeAllWords).ToString();

        //to support post-processing (sorts and pagination), return all hits to populate our in-memory list
        string recordsToRetrieve = "1000000";
        queryString += "&zoom_per_page=" + recordsToRetrieve;

        //use user-specified value for max records to display on the page
        if (!string.IsNullOrEmpty(RecordsPerPage))
        {
            //set records-per-page on the GridView control we'll be displaying search results in,
            //using the value passed to our RecordsPerPage property
            this.gvCollection.PageSize = Convert.ToInt32(RecordsPerPage);
        }
    }

    //create variable to store the zoom search xml
    string zoom_results = string.Empty;

    //create variable to store the collection of parsed search result items
    IEnumerable<SearchResult> SearchResults;

    //As the search of a large index is the most expensive operation,
    //prepare a Session variable in which to store the XML result for this specific zoom search.
    //(Use an Application variable instead if we are likely to get same search request from multiple users)
    string sessionToken = "NewspaperSearchResult_" + queryString;
    object cachedXml = Session[sessionToken];
    bool isFreshSearch = (cachedXml == null);

    //If we have not saved results for these specific search parameters...
    if (isFreshSearch)
    {
        //set our grid back to page 1
        this.gvCollection.PageIndex = 0;

        //specify that we want xml
        queryString += "&zoom_xml=1";

        ProcessStartInfo psi = new ProcessStartInfo();
        psi.CreateNoWindow = true;
        //store the zoom assets in a subfolder ("index0") so an updated index can be
        //ftp'd to an alternate subfolder (e.g. "index1") without interfering with site operations
        psi.FileName = Server.MapPath("lowellledger/index0/search.cgi");
        psi.EnvironmentVariables["REQUEST_METHOD"] = "GET";
        psi.EnvironmentVariables["QUERY_STRING"] = queryString;
        psi.EnvironmentVariables["REMOTE_ADDR"] = Request.ServerVariables["REMOTE_ADDR"];
        psi.RedirectStandardInput = false;
        psi.RedirectStandardOutput = true;
        psi.UseShellExecute = false;

        //dispose the Process so its OS handle is released as soon as the output has been read
        using (Process proc = Process.Start(psi))
        {
            proc.StandardOutput.ReadLine(); // skip the HTTP header line
            zoom_results = proc.StandardOutput.ReadToEnd(); // read the rest into our variable
            proc.WaitForExit();
        }
    }
    //..Or, if we have already stored the result XML for these specific search parameters...
    else
    {
        //just retrieve the result from the session variable
        zoom_results = cachedXml.ToString();
    }

    //The colon character interferes with XDocument parsing; replace that character with an underscore.
    //The first two characters are skipped - presumably the blank line that ends the CGI header.
    XDocument doc = XDocument.Parse(zoom_results.Substring(2).Replace("zoom:", "zoom_"));

    //store the results in our session variable for this search profile - only now that they
    //are known to parse, so a failed or malformed response is never cached
    if (isFreshSearch)
    {
        Session[sessionToken] = zoom_results;
    }

    SearchResults = (from item in doc.Descendants("item")
                     select new SearchResult()
                     {
                         //gather raw SearchResults into our SearchResult class members
                         title = item.Element("title").Value,
                         link = item.Element("link").Value,
                         filetype = item.Element("zoom_filetype").Value,
                         context = item.Element("zoom_context").Value,
                         termsMatched = Convert.ToInt32(item.Element("zoom_termsMatched").Value),
                         score = Convert.ToInt32(item.Element("zoom_score").Value),
                         pubDate = item.Element("pubDate").Value,
                         fileSize = Convert.ToDouble(item.Element("zoom_fileSize").Value.TrimEnd('k')),
                         highlightedTerms = item.Element("zoom_context").Elements("zoom_highlight").ToList<XElement>(),
                     }).ToList<SearchResult>();

    foreach (SearchResult result in SearchResults)
    {
        //post-processing, for transforms not supported in the select operation above

        //for example: in this set of docs, pub dates cannot be determined from file date
        //(many years of docs were scanned at once),
        //but can be determined from file structure, so parse this item's link (href)
        //and store the value in the results class
        Uri uri = new Uri(result.link);
        string filename = System.IO.Path.GetFileNameWithoutExtension(uri.LocalPath);
        string[] dateElements = filename.Split('-');

        //a file whose name does not follow the expected pattern must not abort the whole page;
        //it simply gets no publication date
        DateTime published;
        if (dateElements.Length >= 3
            && DateTime.TryParse(dateElements[0] + "/" + dateElements[1] + "/" + dateElements[2], out published))
        {
            result.dateOfPublication = published;
            result.strDateOfPublication = published.ToShortDateString();
        }
        else
        {
            result.strDateOfPublication = string.Empty;
        }

        //create link (values come from indexed documents, so encode them before embedding in markup)
        result.link = "<a href='" + Server.HtmlEncode(result.link) + "' target='news'>" + Server.HtmlEncode(result.title) + "</a>";

        //format filesize for display (they are big files, so convert Zoom's default measure to MB)
        //NOTE(review): the divisors mix 1024 and 1000 - confirm the intended unit
        result.fileSize = result.fileSize / 1024f / 1000f;

        //extract elements for search terms tagged by Zoom
        //and assign css class for formatting
        result.context = BuildHighlightedContext(result.context, result.highlightedTerms);
    }

    if (!string.IsNullOrEmpty(sortExpression))
    {
        //use linq.dynamic to sort according to user request (typically specified by clicking on a column header)
        SearchResults = SearchResults.OrderBy(sortExpression + " " + sortDirection);

        //set properties that will be used during pagination (wherein we'll re-use the current sort order)
        //(note: pagination is performed using the GridView object's native pagination function)
        previousSortDirection = sortDirection;
        previousSortExpression = sortExpression;
    }

    //materialize once so the sort is not executed separately for the count and for the grid
    List<SearchResult> displayResults = SearchResults.ToList();

    //tell the user how many records we found (the search term is user input, so encode it)
    this.lblSearchStatus.Text = displayResults.Count + " records were found for the search term '" + Server.HtmlEncode(SearchString) + "'...";

    //display results in an ASP.Net GridView control
    this.gvCollection.DataSource = displayResults;
    this.gvCollection.DataBind();
}

/// <summary>
/// Builds the HTML shown for a hit's context: the text is HTML-encoded and every
/// occurrence of a highlighted term is wrapped in a "search_highlight" span.
/// The text is scanned in a single pass, so a term is never wrapped twice and the
/// injected markup itself is never matched.
/// </summary>
/// <param name="context">Plain-text context of the hit.</param>
/// <param name="highlightedTerms">Elements whose text values are the terms to highlight.</param>
/// <returns>HTML-safe context markup.</returns>
private string BuildHighlightedContext(string context, IEnumerable<XElement> highlightedTerms)
{
    string text = context ?? string.Empty;

    //distinct, non-empty terms; longest first so that a longer term wins over one of its prefixes
    string[] escapedTerms = (highlightedTerms ?? Enumerable.Empty<XElement>())
        .Select(e => e.Value)
        .Where(t => !string.IsNullOrEmpty(t))
        .Distinct()
        .OrderByDescending(t => t.Length)
        .Select(t => System.Text.RegularExpressions.Regex.Escape(t))
        .ToArray();

    if (escapedTerms.Length == 0)
    {
        return Server.HtmlEncode(text);
    }

    System.Text.RegularExpressions.Regex termPattern =
        new System.Text.RegularExpressions.Regex(string.Join("|", escapedTerms));

    System.Text.StringBuilder html = new System.Text.StringBuilder();
    int position = 0;
    foreach (System.Text.RegularExpressions.Match match in termPattern.Matches(text))
    {
        //text between matches is encoded as-is; the match itself is encoded and wrapped
        html.Append(Server.HtmlEncode(text.Substring(position, match.Index - position)));
        html.Append("<span class='search_highlight'>");
        html.Append(Server.HtmlEncode(match.Value));
        html.Append("</span>");
        position = match.Index + match.Length;
    }
    html.Append(Server.HtmlEncode(text.Substring(position)));

    return html.ToString();
}
This could all be done better, but hope it helps!
Comment