author_field_filter( array( * 'prefixes' => array( '@' ), * 'wpcom_id_field' => 'author_id', * 'must_query_fields' => array( 'author.engram', 'author_login.engram' ), * 'boost_query_fields' => array( 'author^2', 'author_login^2', 'title.default.engram' ), * ) ); * * //remainder of query * $match_content_fields = $parser->merge_ml_fields( * array( * 'all_content' => 0.1, * ), * array( * 'all_content.default.engram^0.1', * ) * ); * $boost_content_fields = $parser->merge_ml_fields( * array( * 'title' => 2, * 'description' => 1, * 'tags' => 1, * ), * array( * 'author_login^2', * 'author^2', * ) * ); * * $parser->phrase_filter( array( * 'must_query_fields' => $match_content_fields, * 'boost_query_fields' => $boost_content_fields, * ) ); * $parser->remaining_query( array( * 'must_query_fields' => $match_content_fields, * 'boost_query_fields' => $boost_content_fields, * ) ); * * //Boost on phrases * $parser->remaining_query( array( * 'boost_query_fields' => $boost_content_fields, * 'boost_query_type' => 'phrase', * ) ); * * //boosting * $parser->add_max_boost_to_functions( 20 ); * $parser->add_function( 'field_value_factor', array( * 'follower_count' => array( * 'modifier' => 'sqrt', * 'factor' => 1, * 'missing' => 0, * ) ) ); * * //Filtering * $parser->add_filter( array( * 'exists' => array( 'field' => 'langs.' . 
$lang )
 * 	) );
 *
 * 	//run the query
 * 	$es_query_args = array(
 * 		'name' => 'feeds',
 * 		'blog_id' => false,
 * 		'security_strategy' => 'a8c',
 * 		'type' => 'feed,blog',
 * 		'fields' => array( 'blog_id', 'feed_id' ),
 * 		'query' => $parser->build_query(),
 * 		'filter' => $parser->build_filter(),
 * 		'size' => $size,
 * 		'from' => $from
 * 	);
 * 	$es_results = es_api_search_index( $es_query_args, 'api-feed-find' );
 *
 */

jetpack_require_lib( 'jetpack-wpes-query-builder' );

/*
 * Parses a raw user search string into query/filter clauses.
 *
 * The parser keeps a working copy of the user query ($current_query). Each
 * extraction method (author_field_filter, text_field_filter, phrase_filter)
 * pulls the pieces it recognizes out of that working copy and registers the
 * corresponding clauses through add_query()/add_filter() of the parent
 * builder. Whatever is left can then be handled by remaining_query() or
 * remaining_prefix_query().
 */
class Jetpack_WPES_Search_Query_Parser extends Jetpack_WPES_Query_Builder {

	// The query exactly as it was passed to the constructor.
	public $orig_query = '';

	// Working copy of the query; extraction methods remove what they consume.
	public $current_query = '';

	// Normalized language codes (see norm_langs()).
	public $langs;

	// Language codes that may be used as a field suffix; anything else maps to 'default'.
	public $avail_langs = array( 'ar', 'bg', 'ca', 'cs', 'da', 'de', 'el', 'en', 'es', 'eu', 'fa', 'fi', 'fr', 'he', 'hi', 'hu', 'hy', 'id', 'it', 'ja', 'ko', 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh' );

	// Not written or read by any method of this class.
	public $extracted_phrases = array();

	/*
	 * args:
	 *    user_query: the raw search string typed by the user
	 *    langs: list of language/locale codes to build multilingual fields for
	 */
	public function __construct( $user_query, $langs ) {
		$this->orig_query    = $user_query;
		$this->current_query = $this->orig_query;
		$this->langs         = $this->norm_langs( $langs );
	}

	///////////////////////////////////////////////////////
	// Methods for Building arrays of multilingual fields

	/*
	 * Normalize language codes
	 *
	 * Keeps only the part before the first '-' or '_' of each code. Codes that
	 * are not in $avail_langs are collapsed into a single 'default' entry.
	 * A single code may be passed instead of a list.
	 *
	 * returns a list of unique normalized codes
	 */
	public function norm_langs( $langs ) {
		$lst = array();
		foreach ( (array) $langs as $l ) {
			$l = strtok( $l, '-_' );
			// strict comparison: strtok() returns false for an empty code
			if ( in_array( $l, $this->avail_langs, true ) ) {
				$lst[ $l ] = true;
			} else {
				$lst['default'] = true;
			}
		}
		return array_keys( $lst );
	}

	/*
	 * Take a list of field prefixes and expand them for multi-lingual
	 * with the provided boostings.
	 *
	 * args:
	 *    fields2boosts: map of field prefix => boost; each is expanded to
	 *      "<prefix>.<lang>^<boost>" for every language in $this->langs
	 *    additional_fields: fields appended to the result unchanged
	 *
	 * returns the combined list of field strings
	 */
	public function merge_ml_fields( $fields2boosts, $additional_fields ) {
		$flds = array();
		foreach ( $fields2boosts as $f => $b ) {
			foreach ( $this->langs as $l ) {
				$flds[] = $f . '.' . $l . '^' . $b;
			}
		}
		foreach ( $additional_fields as $f ) {
			$flds[] = $f;
		}
		return $flds;
	}

	////////////////////////////////////
	// Extract Fields for Filtering on

	/*
	 * Extract any @mentions from the user query
	 *  use them as a filter if we can find a wp.com id
	 *  otherwise use them as a query against the author fields
	 *
	 * args:
	 *    wpcom_id_field: wp.com id field
	 *    must_query_fields: array of fields to search for matching results (optional)
	 *    boost_query_fields: array of fields to search in for boosting results (optional)
	 *    prefixes: array of prefixes that the user can use to indicate an author
	 *
	 * returns true/false of whether any were found
	 *
	 * See also: https://github.com/twitter/twitter-text/blob/master/java/src/com/twitter/Regex.java
	 */
	public function author_field_filter( $args ) {
		$defaults = array(
			'wpcom_id_field'     => 'author_id',
			'must_query_fields'  => null,
			'boost_query_fields' => null,
			'prefixes'           => array( '@' ),
		);
		$args = wp_parse_args( $args, $defaults );

		$names = $this->extract_prefixed_fields( $args['prefixes'] );
		if ( empty( $names ) ) {
			return false;
		}

		$user_ids = array();

		//loop through the matches and separate into filters and queries
		foreach ( $names as $n ) {
			// Strip the quotes first so that @"gibrown" is looked up as
			// gibrown rather than as a login containing quote characters.
			$is_phrase = ( false !== strpos( $n, '"' ) );
			if ( $is_phrase ) {
				$n = str_replace( '"', '', $n );
			}

			//check for exact match on login
			$userdata  = get_user_by( 'login', strtolower( $n ) );
			$filtering = false;
			if ( $userdata ) {
				$user_ids[ $userdata->ID ] = true;
				$filtering                 = true;
			}

			// A known user is handled by the id filter below, so the text
			// only has to match when no user was found.
			if ( ! $filtering ) {
				$this->add_field_match( $args['must_query_fields'], $n, $is_phrase );
			}
			$this->add_field_match( $args['boost_query_fields'], $n, $is_phrase, true );
		}

		if ( ! empty( $user_ids ) ) {
			$user_ids = array_keys( $user_ids );
			$this->add_filter( array( 'terms' => array( $args['wpcom_id_field'] => $user_ids ) ) );
		}

		return true;
	}

	/*
	 * Extract any prefix followed by text use them as a must clause,
	 *   and optionally as a boost to the should query
	 *   This can be used for hashtags. eg #News, or #"current events",
	 *   but also works for any arbitrary field. eg from:Greg
	 *
	 * args:
	 *    must_query_fields: array of fields that must match the tag (optional)
	 *    boost_query_fields: array of fields to boost search on (optional)
	 *    prefixes: array of prefixes that the user can use to indicate a tag
	 *
	 * returns true/false of whether any were found
	 *
	 */
	public function text_field_filter( $args ) {
		$defaults = array(
			'must_query_fields'  => array( 'tag.name' ),
			'boost_query_fields' => array( 'tag.name' ),
			'prefixes'           => array( '#' ),
		);
		$args = wp_parse_args( $args, $defaults );

		$tags = $this->extract_prefixed_fields( $args['prefixes'] );
		if ( empty( $tags ) ) {
			return false;
		}

		foreach ( $tags as $t ) {
			$is_phrase = ( false !== strpos( $t, '"' ) );
			if ( $is_phrase ) {
				$t = str_replace( '"', '', $t );
			}

			$this->add_field_match( $args['must_query_fields'], $t, $is_phrase );
			$this->add_field_match( $args['boost_query_fields'], $t, $is_phrase, true );
		}

		return true;
	}

	/*
	 * Extract anything surrounded by quotes or if there is an opening quote
	 * that is not complete, and add them to the query as a phrase query.
	 * Quotes can be either '' or ""
	 *
	 * args:
	 *    must_query_fields: array of fields that must match the phrases
	 *    boost_query_fields: array of fields to boost the phrases on (optional)
	 *
	 * returns true/false of whether any were found
	 *
	 */
	public function phrase_filter( $args ) {
		$defaults = array(
			'must_query_fields'  => array( 'all_content' ),
			'boost_query_fields' => array( 'title' ),
		);
		$args = wp_parse_args( $args, $defaults );

		$phrases = array();

		//completed phrases: double quotes first, then single quotes
		foreach ( array( '/"([^"]+)"/', "/'([^']+)'/" ) as $regex ) {
			if ( preg_match_all( $regex, $this->current_query, $matches ) ) {
				foreach ( $matches[1] as $match ) {
					$phrases[] = $match;
				}
				$this->current_query = preg_replace( $regex, '', $this->current_query );
			}
		}

		//look for a final, uncompleted phrase
		// Every unterminated phrase is kept: the text is removed from the
		// remaining query, so dropping it would lose it from the search.
		foreach ( array( '/"([^"]+)$/', "/'([^']+)$/" ) as $regex ) {
			if ( preg_match( $regex, $this->current_query, $matches ) ) {
				$phrases[]           = $matches[1];
				$this->current_query = preg_replace( $regex, '', $this->current_query );
			}
		}

		if ( empty( $phrases ) ) {
			return false;
		}

		foreach ( $phrases as $p ) {
			$this->add_query( array( 'multi_match' => array(
				'fields' => $args['must_query_fields'],
				'query'  => $p,
				'type'   => 'phrase',
			) ) );

			if ( ! empty( $args['boost_query_fields'] ) ) {
				$this->add_query( array( 'multi_match' => array(
					'fields'   => $args['boost_query_fields'],
					'query'    => $p,
					'operator' => 'and',
				) ), 'should' );
			}
		}

		return true;
	}

	/*
	 * Query fields based on the remaining parts of the query
	 * This could be the final AND part of the query terms to match, or it
	 *  could be boosting certain elements of the query
	 *
	 * Does nothing when the remaining query is empty or only whitespace.
	 *
	 * args:
	 *    must_query_fields: array of fields that must match the remaining terms (optional)
	 *    boost_query_fields: array of fields to boost the remaining terms on (optional)
	 *    boost_operator: operator used for the boost query (default 'and')
	 *    boost_query_type: multi_match type used for the boost query (default 'best_fields')
	 *
	 */
	public function remaining_query( $args ) {
		$defaults = array(
			'must_query_fields'  => null,
			'boost_query_fields' => null,
			'boost_operator'     => 'and',
			'boost_query_type'   => 'best_fields',
		);
		$args = wp_parse_args( $args, $defaults );

		if ( ! $this->has_remaining_query() ) {
			return;
		}

		$this->add_remaining_term_queries( $args );
	}

	/*
	 * Query fields using a prefix query (alphabetical expansions on the index).
	 * This is not recommended. Slower performance and worse relevancy.
	 *
	 * (UNTESTED! Copied from old prefix expansion code)
	 *
	 * Does nothing when the remaining query is empty or only whitespace.
	 *
	 * args:
	 *    must_query_fields: array of fields that must match the remaining terms (optional)
	 *    boost_query_fields: array of fields to boost the remaining terms on (optional)
	 *    boost_operator: operator used for the boost query (default 'and')
	 *    boost_query_type: multi_match type used for the boost query (default 'best_fields')
	 *
	 */
	public function remaining_prefix_query( $args ) {
		$defaults = array(
			'must_query_fields'  => array( 'all_content' ),
			'boost_query_fields' => array( 'title' ),
			'boost_operator'     => 'and',
			'boost_query_type'   => 'best_fields',
		);
		$args = wp_parse_args( $args, $defaults );

		if ( ! $this->has_remaining_query() ) {
			return;
		}

		//////////////////////////////////
		// Example cases to think about:
		// "elasticse"
		// "elasticsearch"
		// "elasticsearch "
		// "elasticsearch lucen"
		// "elasticsearch lucene"
		// "the future"  - note the stopword which will match nothing!
		// "F1" - an exact match that also has tons of expansions
		// "こんにちは" ja "hello"
		// "こんにちは友人" ja "hello friend" - we just rely on the prefix phrase and ES to split words
		//   - this could still be better I bet. Maybe we need to analyze with ES first?
		//

		/////////////////////////////
		//extract pieces of query
		// eg: "PREFIXREMAINDER PREFIXWORD"
		//     "elasticsearch lucen"

		$prefix_word = false;
		if ( preg_match( '/([^ ]+)$/', $this->current_query, $matches ) ) {
			$prefix_word = $matches[1];
		}

		$prefix_remainder = preg_replace( '/([^ ]+)$/', '', $this->current_query );
		if ( '' === $prefix_remainder || ctype_space( $prefix_remainder ) ) {
			$prefix_remainder = false;
		}

		if ( false === $prefix_word ) {
			//Space at the end of the query, so skip using a prefix query
			$this->add_remaining_term_queries( $args );
			return;
		}

		//must match the prefix word and the prefix remainder
		if ( ! empty( $args['must_query_fields'] ) ) {
			//need to do an OR across a few fields to handle all cases
			$must_q = array( 'bool' => array( 'should' => array(), 'minimum_should_match' => 1 ) );

			//treat all words as an exact search (boosts complete word like "news"
			//from prefixes of "newspaper")
			$must_q['bool']['should'][] = array( 'multi_match' => array(
				'fields'   => $args['must_query_fields'],
				'query'    => $this->current_query,
				'operator' => 'and',
				'type'     => 'cross_fields',
			) );

			//always optimistically try and match the full text as a phrase
			//prefix "the futu" should try to match "the future"
			//otherwise the first stopword kinda breaks
			//This also works as the prefix match for a single word "elasticsea"
			$must_q['bool']['should'][] = array( 'multi_match' => array(
				'fields'         => $args['must_query_fields'],
				'query'          => $this->current_query,
				'operator'       => 'and',
				'type'           => 'phrase_prefix',
				'max_expansions' => 100,
			) );

			if ( false !== $prefix_remainder ) {
				//Multiple words found, so treat each word on its own and not just as
				//a part of a phrase
				//"elasticsearch lucen" => "elasticsearch" exact AND "lucen" prefix
				$must_q['bool']['should'][] = array( 'bool' => array(
					'must' => array(
						array( 'multi_match' => array(
							'fields'         => $args['must_query_fields'],
							'query'          => $prefix_word,
							'operator'       => 'and',
							'type'           => 'phrase_prefix',
							'max_expansions' => 100,
						) ),
						array( 'multi_match' => array(
							'fields'   => $args['must_query_fields'],
							'query'    => $prefix_remainder,
							'operator' => 'and',
							'type'     => 'cross_fields',
						) ),
					),
				) );
			}

			$this->add_query( $must_q );
		}

		//Now add any boosting of the query
		if ( ! empty( $args['boost_query_fields'] ) ) {
			//treat all words as an exact search (boosts complete word like "news"
			//from prefixes of "newspaper")
			$this->add_query( array( 'multi_match' => array(
				'fields'   => $args['boost_query_fields'],
				'query'    => $this->current_query,
				'operator' => $args['boost_operator'],
				'type'     => $args['boost_query_type'],
			) ), 'should' );

			//optimistically boost the full phrase prefix match
			// Added as a should clause: this is a boost, it must not restrict
			// the result set to documents matching the boost fields.
			$this->add_query( array( 'multi_match' => array(
				'fields'         => $args['boost_query_fields'],
				'query'          => $this->current_query,
				'operator'       => 'and',
				'type'           => 'phrase_prefix',
				'max_expansions' => 100,
			) ), 'should' );
		}
	}

	/*
	 * Boost results based on the lang probability overlaps
	 *
	 * args:
	 *    langs2prob: list of languages to search in with associated boosts
	 *
	 * NOTE(review): the language key of each entry is never used and no field
	 * name is passed to the function definition, so every entry registers the
	 * same field-less field_value_factor with a different factor. The intended
	 * per-language field is not visible here - confirm and fix with the index
	 * mapping at hand.
	 */
	public function boost_lang_probs( $langs2prob ) {
		foreach ( $langs2prob as $l => $p ) {
			$this->add_function( 'field_value_factor', array(
				'modifier' => 'none',
				'factor'   => $p,
				'missing'  => 0.01, //1% chance doc did not have right lang detected
			) );
		}
	}

	////////////////////////////////////
	// Helper Methods

	//Get the text after some prefix. eg @gibrown, or @"Greg Brown"
	// Returns the list of matched values (quotes included) or false.
	protected function get_fields( $field_prefix ) {
		$regex = $this->build_field_regex( $field_prefix );
		if ( preg_match_all( $regex, $this->current_query, $match ) ) {
			return $match[1];
		}
		return false;
	}

	//Remove the prefix and text from the query
	protected function remove_fields( $field_name ) {
		$stripped = preg_replace( $this->build_field_regex( $field_name ), '', $this->current_query );
		// preg_replace() returns null on failure (eg invalid UTF-8 input);
		// keep the query untouched rather than wiping it out.
		if ( null !== $stripped ) {
			$this->current_query = $stripped;
		}
	}

	// Build the regex matching a prefix followed by a quoted phrase or by a
	// run of non-separator characters.
	// The prefix is matched literally, and the pattern runs in UTF-8 mode so
	// that \p{Z} tests whole characters instead of single bytes.
	protected function build_field_regex( $field_prefix ) {
		return '/' . preg_quote( $field_prefix, '/' ) . '(("[^"]+")|([^\\p{Z}]+))/u';
	}

	// Collect the values following any of the prefixes and remove them (with
	// their prefix) from the remaining query. Returns an empty array, leaving
	// the query untouched, when nothing was found.
	protected function extract_prefixed_fields( $prefixes ) {
		$values = array();
		foreach ( $prefixes as $p ) {
			$found = $this->get_fields( $p );
			if ( $found ) {
				foreach ( $found as $f ) {
					$values[] = $f;
				}
			}
		}

		if ( empty( $values ) ) {
			return array();
		}

		foreach ( $prefixes as $p ) {
			$this->remove_fields( $p );
		}

		return $values;
	}

	// Add a multi_match query for $text on $fields, as a phrase query when
	// $is_phrase is set. Added as a should clause when $as_boost is set.
	// Does nothing when no fields are given.
	protected function add_field_match( $fields, $text, $is_phrase, $as_boost = false ) {
		if ( empty( $fields ) ) {
			return;
		}

		$match = array(
			'fields' => $fields,
			'query'  => $text,
		);
		if ( $is_phrase ) {
			$match['type'] = 'phrase';
		}

		$query = array( 'multi_match' => $match );
		if ( $as_boost ) {
			$this->add_query( $query, 'should' );
		} else {
			$this->add_query( $query );
		}
	}

	// Whether anything other than whitespace is left in the query.
	// Compares against '' rather than using empty() so that a query of "0"
	// still counts as a query.
	protected function has_remaining_query() {
		$query = (string) $this->current_query;
		return '' !== $query && ! ctype_space( $query );
	}

	// Add the plain (non prefix) queries for the remaining terms: an AND match
	// on the must fields and a should clause on the boost fields.
	protected function add_remaining_term_queries( $args ) {
		if ( ! empty( $args['must_query_fields'] ) ) {
			$this->add_query( array( 'multi_match' => array(
				'fields'   => $args['must_query_fields'],
				'query'    => $this->current_query,
				'operator' => 'and',
			) ) );
		}

		if ( ! empty( $args['boost_query_fields'] ) ) {
			$this->add_query( array( 'multi_match' => array(
				'fields'   => $args['boost_query_fields'],
				'query'    => $this->current_query,
				'operator' => $args['boost_operator'],
				'type'     => $args['boost_query_type'],
			) ), 'should' );
		}
	}

	//Best effort string truncation that splits on word breaks
	//
	// args:
	//    string: the text to shorten
	//    limit: maximum display width (as measured by mb_strwidth) of the result
	//    break: single character to split on
	//
	// returns the string unchanged when it already fits; otherwise the longest
	//  leading part that fits and ends right before a break character, or a
	//  mid-word cut when there is no break to split on
	public function truncate_string( $string, $limit, $break = " " ) {
		if ( mb_strwidth( $string ) <= $limit ) {
			return $string;
		}

		if ( $limit <= 0 ) {
			return '';
		}

		// Everything that fits in the limit. Positions below are character
		// offsets derived from this, so wide characters cannot push an offset
		// past the end of the string.
		$head        = mb_strimwidth( $string, 0, $limit );
		$head_length = mb_strlen( $head );

		// walk backwards to find first break, starting with the character
		// right after the part that fits
		for ( $breakpoint = $head_length; $breakpoint > 0; $breakpoint-- ) {
			if ( $break === mb_substr( $string, $breakpoint, 1 ) ) {
				return mb_substr( $string, 0, $breakpoint );
			}
		}

		// if we weren't able to find a break, need to chop mid-word
		return $head;
	}
}