This repository has been archived on 2024-09-05. You can view files and clone it, but cannot push or open issues or pull requests.
shimmie2/core/imageboard/search.php
2024-08-31 22:00:30 +01:00

467 lines
17 KiB
PHP

<?php
declare(strict_types=1);
namespace Shimmie2;
use GQLA\Query;
/**
 * A fragment of SQL text bundled with the named parameters it refers to.
 *
 * Fragments are meant to be glued together into one larger statement, eg
 *
 *   $q = new Querylet("SELECT * FROM images");
 *   $q->append(new Querylet(" WHERE id = :id", ["id" => 123]));
 *   $q->append(new Querylet(" AND rating = :rating", ["rating" => "safe"]));
 *   $q->append(new Querylet(" ORDER BY id DESC"));
 *
 * leaves $q holding
 *
 *   sql:       SELECT * FROM images WHERE id = :id AND rating = :rating ORDER BY id DESC
 *   variables: ["id" => 123, "rating" => "safe"]
 */
class Querylet
{
    /**
     * @param string $sql the SQL text of this fragment
     * @param array<string, mixed> $variables values for the placeholders used in $sql
     */
    public function __construct(
        public string $sql,
        public array $variables = [],
    ) {
    }

    /**
     * Concatenate another fragment onto the end of this one, in place.
     *
     * No separator is inserted between the two SQL strings, so the caller
     * supplies any whitespace that is needed. Variables are combined with
     * array_merge(), so for a string key present in both fragments the
     * appended fragment's value wins.
     */
    public function append(Querylet $querylet): void
    {
        $combined_sql = $this->sql . $querylet->sql;
        $combined_variables = array_merge($this->variables, $querylet->variables);

        $this->sql = $combined_sql;
        $this->variables = $combined_variables;
    }
}
/**
 * One tag term from a search, eg "cat", "cute" or "-angry".
 *
 * $tag holds the tag text and $positive says whether matching images
 * should be included (true, the default) or excluded (false).
 */
class TagCondition
{
    /**
     * @param string $tag the tag being searched for
     * @param bool $positive false when the term is a negated one
     */
    public function __construct(
        public string $tag,
        public bool $positive = true,
    ) {
    }
}
/**
 * One image-property term from a search, eg "rating:safe", "id:123"
 * or "width:100".
 *
 * The condition itself is carried as a Querylet (SQL text plus its
 * parameters); $positive says whether the condition must hold (true,
 * the default) or must not hold (false).
 */
class ImgCondition
{
    /**
     * @param Querylet $qlet the SQL condition and its variables
     * @param bool $positive false when the condition is to be negated
     */
    public function __construct(
        public Querylet $qlet,
        public bool $positive = true,
    ) {
    }
}
class Search
{
/**
* The search code is dark and full of horrors, and it's not always clear
* what's going on. This is a list of the steps that the search code took
* to find the images that it returned.
*
* @var list<string>
*/
public static array $_search_path = [];
/**
 * Run a search for the given terms and hand back the raw result rows.
 *
 * A negative $start is raised to 0 and a $limit below 1 is raised to 1.
 * When the SpeedHax extension is enabled and its BIG_SEARCH setting is
 * above zero, users without the BIG_SEARCH permission may not search for
 * more terms than that setting allows.
 *
 * @param list<string> $tags
 * @throws PermissionDenied when the term-count restriction above is violated
 */
private static function find_images_internal(int $start = 0, ?int $limit = null, array $tags = []): \FFSPHP\PDOStatement
{
    global $config, $database, $user;

    // clamp the paging arguments to their minimum sensible values
    $start = ($start < 0) ? 0 : $start;
    if (!is_null($limit)) {
        $limit = ($limit < 1) ? 1 : $limit;
    }

    if (Extension::is_enabled(SpeedHaxInfo::KEY) && $config->get_int(SpeedHaxConfig::BIG_SEARCH) > 0) {
        $anon_limit = $config->get_int(SpeedHaxConfig::BIG_SEARCH);
        if (!$user->can(Permissions::BIG_SEARCH) && count($tags) > $anon_limit) {
            throw new PermissionDenied("Anonymous users may only search for up to $anon_limit tags at a time");
        }
    }

    // terms -> abstract conditions -> SQL
    $conditions = self::terms_to_conditions($tags);
    $querylet = self::build_search_querylet(
        $conditions[0],
        $conditions[1],
        $conditions[2],
        $limit,
        $start
    );

    return $database->get_all_iterable($querylet->sql, $querylet->variables);
}
/**
 * Run a search and return every matching image as a fully built array.
 *
 * Each row produced by find_images_internal() is wrapped in an Image.
 *
 * @param list<string> $tags
 * @return Image[]
 */
#[Query(name: "posts", type: "[Post!]!", args: ["tags" => "[string!]"])]
public static function find_images(int $offset = 0, ?int $limit = null, array $tags = []): array
{
    $found = [];
    $rows = self::find_images_internal($offset, $limit, $tags);
    foreach ($rows as $row) {
        $image = new Image($row);
        $found[] = $image;
    }
    return $found;
}
/**
 * Run a search and yield the matching images one at a time, so the
 * caller does not need to hold the whole result set as an array.
 *
 * @param list<string> $tags
 * @return \Generator<Image>
 */
public static function find_images_iterable(int $start = 0, ?int $limit = null, array $tags = []): \Generator
{
    foreach (self::find_images_internal($start, $limit, $tags) as $row) {
        $image = new Image($row);
        yield $image;
    }
}
/**
 * Get a specific set of images, in the order that the set specifies,
 * with all the search stuff (rating filters etc) taken into account
 *
 * The IDs are looked up through a regular "id=1,2,3" search, so any image
 * which that search does not return is silently left out of the result.
 * Array keys of $ids are preserved for the entries that survive, so the
 * result may have gaps in its keys; duplicate IDs yield duplicate entries.
 *
 * @param int[] $ids
 * @return Image[]
 */
public static function get_images(array $ids): array
{
    // nothing requested - don't run a search for the meaningless term "id="
    if (count($ids) === 0) {
        return [];
    }

    // index the images the search returned by their ID
    $visible_images = [];
    foreach (Search::find_images(tags: ["id=" . implode(",", $ids)]) as $image) {
        $visible_images[$image->id] = $image;
    }

    // walk the requested IDs in their original order, keeping the ones
    // that were found; isset() on the map is an O(1) membership test
    $images = [];
    foreach ($ids as $key => $id) {
        if (isset($visible_images[$id])) {
            $images[$key] = $visible_images[$id];
        }
    }
    return $images;
}
/*
* Image-related utility functions
*/
/**
 * Read the stored usage count of a single tag from the tags table.
 *
 * The tag name is compared case-insensitively. The value returned by the
 * database is cast to int.
 */
public static function count_tag(string $tag): int
{
    global $database;

    $sql = "SELECT count FROM tags WHERE LOWER(tag) = LOWER(:tag)";
    $args = ["tag" => $tag];
    $count = $database->get_one($sql, $args);

    return (int)$count;
}
/**
 * Total number of rows in the images table.
 *
 * The value is obtained via cache_get_or_set() under the key "image-count",
 * with 600 passed as the third argument (presumably a lifetime in seconds;
 * confirm against cache_get_or_set).
 */
private static function count_total_images(): int
{
    global $database;

    $count_all_images = function () use ($database): int {
        return (int)$database->get_one("SELECT COUNT(*) FROM images");
    };

    return cache_get_or_set("image-count", $count_all_images, 600);
}
/**
 * Count the number of image results for a given search
 *
 * When the SpeedHax extension is enabled with LIMIT_COMPLEX set, two
 * shortcuts avoid running the real search:
 *  - no terms at all: the total image count is returned
 *  - exactly one plain tag (none of the characters : = > < * ?): the
 *    stored per-tag count is used, subtracted from the total image count
 *    if the tag is negated
 * Everything else builds the real search query and counts its rows. That
 * result is looked up in the cache first, and is stored there only when
 * SpeedHax is on and the count is above 5000.
 *
 * @param list<string> $tags
 */
public static function count_images(array $tags = []): int
{
    global $cache, $config, $database;
    $tag_count = count($tags);
    // speed_hax ignores the fact that extensions can add img_conditions
    // even when there are no tags being searched for
    $speed_hax = (Extension::is_enabled(SpeedHaxInfo::KEY) && $config->get_bool(SpeedHaxConfig::LIMIT_COMPLEX));
    if ($speed_hax && $tag_count === 0) {
        // total number of images in the DB
        $total = self::count_total_images();
    } elseif ($speed_hax && $tag_count === 1 && !preg_match("/[:=><\*\?]/", $tags[0])) {
        if (!str_starts_with($tags[0], "-")) {
            // one positive tag - we can look that up directly
            $total = self::count_tag($tags[0]);
        } else {
            // one negative tag - subtract from the total
            $total = self::count_total_images() - self::count_tag(substr($tags[0], 1));
            // the image total comes from a cache while the tag count is
            // read live, so the difference can briefly dip below zero;
            // a negative result count is never meaningful
            if ($total < 0) {
                $total = 0;
            }
        }
    } else {
        // complex query
        // implode(tags) can be too long for memcache, so use the hash of tags as the key
        $cache_key = "image-count:" . md5(Tag::implode($tags));
        $total = $cache->get($cache_key);
        if (is_null($total)) {
            // the sort order is irrelevant for counting, so only the
            // conditions are taken from the parsed terms
            [$tag_conditions, $img_conditions] = self::terms_to_conditions($tags);
            $querylet = self::build_search_querylet($tag_conditions, $img_conditions, null);
            $total = (int)$database->get_one("SELECT COUNT(*) AS cnt FROM ($querylet->sql) AS tbl", $querylet->variables);
            if ($speed_hax && $total > 5000) {
                // when we have a ton of images, the count
                // won't change dramatically very often
                $cache->set($cache_key, $total, 3600);
            }
        }
    }
    return $total;
}
/**
 * Resolve a tag, or a wildcard pattern, to the IDs of all matching tags.
 *
 * The term is converted with Tag::sqlify() and matched with a
 * case-insensitive LIKE, so one term may match zero, one or many tags.
 *
 * The IDs are cast to int and re-indexed before being returned: callers in
 * this class splice them directly into SQL text, so the result must be
 * exactly the list<int> that is promised here, whatever type the database
 * driver hands back.
 *
 * @return list<int>
 */
private static function tag_or_wildcard_to_ids(string $tag): array
{
    global $database;
    $sq = "SELECT id FROM tags WHERE LOWER(tag) LIKE LOWER(:tag)";
    if ($database->get_driver_id() === DatabaseDriverID::SQLITE) {
        // on SQLite the escape character for LIKE is declared explicitly
        $sq .= "ESCAPE '\\'";
    }
    $ids = [];
    foreach ($database->get_col($sq, ["tag" => Tag::sqlify($tag)]) as $id) {
        $ids[] = (int)$id;
    }
    return $ids;
}
/**
 * Translate raw search terms into an abstract search description.
 *
 * A SearchTermParseEvent is sent once with a null term and then once per
 * entry of $terms; each event carries a running event number and the full
 * term list. The tag and image conditions from every event are collected.
 * The sort order is the first non-null order reported by an event, and if
 * that ends up empty, "images." followed by the IndexConfig::ORDER setting
 * is used instead.
 *
 * (This is only public for testing purposes, nobody should be calling this
 * directly from outside this class)
 *
 * @param string[] $terms
 * @return array{0: TagCondition[], 1: ImgCondition[], 2: string}
 */
public static function terms_to_conditions(array $terms): array
{
    global $config;

    $collected_tags = [];
    $collected_imgs = [];
    $order = null;

    // the leading null gives listeners one event that is not tied to any term
    $to_parse = array_merge([null], $terms);

    $event_number = 0; // search term parse event number
    foreach ($to_parse as $term) {
        $event = send_event(new SearchTermParseEvent($event_number, $term, $terms));
        $event_number += 1;

        if ($order === null) {
            $order = $event->order;
        }
        $collected_imgs = array_merge($collected_imgs, $event->img_conditions);
        $collected_tags = array_merge($collected_tags, $event->tag_conditions);
    }

    if (!$order) {
        $order = "images." . $config->get_string(IndexConfig::ORDER);
    }

    return [$collected_tags, $collected_imgs, $order];
}
/**
 * Turn an abstract search query into an SQL Querylet
 *
 * (This is only public for testing purposes, nobody should be calling this
 * directly from outside this class)
 *
 * One of three strategies is used, and each decision taken along the way
 * is appended to $_search_path:
 *  - "no_tags": no tag conditions, select straight from images
 *  - "fast": a single positive tag, no image conditions, sorted by
 *    "id DESC" / "images.id DESC", with both limit and offset given;
 *    paging is done inside the image_tags subquery
 *  - "general": everything else
 *
 * @param TagCondition[] $tag_conditions
 * @param ImgCondition[] $img_conditions
 * @param string|null $order text appended after "ORDER BY" as-is; null for no ORDER BY
 * @param int|null $limit maximum number of rows; null for no LIMIT / OFFSET
 * @param int|null $offset rows to skip; only used together with $limit,
 *                         and null is treated as 0
 * @throws InvalidInput if the tag conditions yield no usable criteria at all
 */
public static function build_search_querylet(
    array $tag_conditions,
    array $img_conditions,
    ?string $order = null,
    ?int $limit = null,
    ?int $offset = null
): Querylet {
    if (count($tag_conditions) === 0) {
        // no tags, do a simple search
        static::$_search_path[] = "no_tags";
        $query = new Querylet("SELECT images.* FROM images WHERE 1=1");
    } elseif (
        // one tag sorted by ID - we can fetch this from the image_tags table,
        // and do the offset / limit there, which is 10x faster than fetching
        // all the image_tags and doing the offset / limit on the result.
        count($tag_conditions) === 1
        && $tag_conditions[0]->positive
        // We can only do this if img_conditions is empty, because
        // we're going to apply the offset / limit to the image_tags
        // subquery, and applying extra conditions to the top-level
        // query might reduce the total results below the target limit
        && empty($img_conditions)
        // We can only do this if we're sorting by ID, because
        // we're going to be using the image_tags table, which
        // only has image_id and tag_id, not any other columns
        && ($order === "id DESC" || $order === "images.id DESC")
        // This is only an optimisation if we are applying limit
        // and offset
        && !is_null($limit)
        && !is_null($offset)
    ) {
        static::$_search_path[] = "fast";
        $tc = $tag_conditions[0];
        // IN (SELECT id FROM tags) is 100x slower than doing a separate
        // query and then a second query for IN(first_query_results)??
        $tag_array = self::tag_or_wildcard_to_ids($tc->tag);
        if (count($tag_array) === 0) {
            // if wildcard expanded to nothing, take a shortcut
            static::$_search_path[] = "invalid_tag";
            $query = new Querylet("SELECT images.* FROM images WHERE 1=0");
        } else {
            $set = implode(', ', $tag_array);
            $query = new Querylet("
SELECT images.*
FROM images INNER JOIN (
SELECT DISTINCT it.image_id
FROM image_tags it
WHERE it.tag_id IN ($set)
ORDER BY it.image_id DESC
LIMIT :limit OFFSET :offset
) a on a.image_id = images.id
WHERE 1=1
", ["limit" => $limit, "offset" => $offset]);
            // don't offset at the image level because
            // we already offset at the image_tags level
            $limit = null;
            $offset = null;
        }
    } else {
        // more than one tag, or more than zero other conditions, or a non-default sort order
        static::$_search_path[] = "general";
        $positive_tag_id_array = [];
        $positive_wildcard_id_array = [];
        $negative_tag_id_array = [];
        $all_nonexistent_negatives = true;
        foreach ($tag_conditions as $tq) {
            $tag_ids = self::tag_or_wildcard_to_ids($tq->tag);
            $tag_count = count($tag_ids);
            if ($tq->positive) {
                $all_nonexistent_negatives = false;
                if ($tag_count === 0) {
                    // one of the positive tags had zero results, therefore there
                    // can be no results; "where 1=0" should shortcut things
                    static::$_search_path[] = "invalid_tag";
                    return new Querylet("SELECT images.* FROM images WHERE 1=0");
                } elseif ($tag_count === 1) {
                    // All wildcard terms that qualify for a single tag can be treated the same as non-wildcards
                    $positive_tag_id_array[] = $tag_ids[0];
                } else {
                    // Terms that resolve to multiple tags act as an OR within themselves
                    // and as an AND in relation to all other terms,
                    $positive_wildcard_id_array[] = $tag_ids;
                }
            } elseif ($tag_count > 0) {
                $all_nonexistent_negatives = false;
                // Unlike positive criteria, negative criteria are all handled in an OR fashion,
                // so we can just compile them all into a single sub-query.
                $negative_tag_id_array = array_merge($negative_tag_id_array, $tag_ids);
            }
        }
        assert($positive_tag_id_array || $positive_wildcard_id_array || $negative_tag_id_array || $all_nonexistent_negatives, _get_query());
        if ($all_nonexistent_negatives) {
            // every term was a negated tag that doesn't exist, so nothing is excluded
            static::$_search_path[] = "all_nonexistent_negatives";
            $query = new Querylet("SELECT images.* FROM images WHERE 1=1");
        } elseif (!empty($positive_tag_id_array) || !empty($positive_wildcard_id_array)) {
            static::$_search_path[] = "some_positives";
            // one SQL comparison per positive term: "= id" for a single
            // tag, "IN (ids)" for a wildcard that matched several tags
            $inner_joins = [];
            foreach ($positive_tag_id_array as $tag) {
                $inner_joins[] = "= $tag";
            }
            foreach ($positive_wildcard_id_array as $tags) {
                $positive_tag_id_list = join(', ', $tags);
                $inner_joins[] = "IN ($positive_tag_id_list)";
            }
            // the first comparison goes into the WHERE clause of the
            // subquery; every further one becomes a self-join on image_tags
            $first = array_shift($inner_joins);
            $sub_query = "SELECT DISTINCT it.image_id FROM image_tags it ";
            $i = 0;
            foreach ($inner_joins as $inner_join) {
                $i++;
                $sub_query .= " INNER JOIN image_tags it$i ON it$i.image_id = it.image_id AND it$i.tag_id $inner_join ";
            }
            if (!empty($negative_tag_id_array)) {
                $negative_tag_id_list = join(', ', $negative_tag_id_array);
                $sub_query .= " LEFT JOIN image_tags negative ON negative.image_id = it.image_id AND negative.tag_id IN ($negative_tag_id_list) ";
            }
            $sub_query .= "WHERE it.tag_id $first ";
            if (!empty($negative_tag_id_array)) {
                // keep only images for which the LEFT JOIN found no negative tag
                $sub_query .= " AND negative.image_id IS NULL";
            }
            $sub_query .= " GROUP BY it.image_id ";
            $query = new Querylet("
SELECT images.*
FROM images
INNER JOIN ($sub_query) a on a.image_id = images.id
");
        } elseif (!empty($negative_tag_id_array)) {
            static::$_search_path[] = "only_negative_tags";
            $negative_tag_id_list = join(', ', $negative_tag_id_array);
            $query = new Querylet("
SELECT images.*
FROM images
LEFT JOIN image_tags negative ON negative.image_id = images.id AND negative.tag_id in ($negative_tag_id_list)
WHERE negative.image_id IS NULL
");
        } else {
            throw new InvalidInput("No criteria specified");
        }
    }
    /*
     * Merge all the image metadata searches into one generic querylet
     * and append to the base querylet with "AND blah"
     *
     * (the "some_positives" query has no WHERE clause, so there the
     * "AND blah" extends the ON condition of its INNER JOIN instead)
     */
    if (!empty($img_conditions)) {
        $n = 0;
        $img_sql = "";
        $img_vars = [];
        foreach ($img_conditions as $iq) {
            if ($n++ > 0) {
                $img_sql .= " AND";
            }
            if (!$iq->positive) {
                $img_sql .= " NOT";
            }
            $img_sql .= " (" . $iq->qlet->sql . ")";
            $img_vars = array_merge($img_vars, $iq->qlet->variables);
        }
        $query->append(new Querylet(" AND "));
        $query->append(new Querylet($img_sql, $img_vars));
    }
    if (!is_null($order)) {
        $query->append(new Querylet(" ORDER BY ".$order));
    }
    if (!is_null($limit)) {
        $query->append(new Querylet(" LIMIT :limit ", ["limit" => $limit]));
        // a limit may be given without an offset; binding NULL to OFFSET
        // is not portable across databases, so start from row 0 instead
        $query->append(new Querylet(" OFFSET :offset ", ["offset" => $offset ?? 0]));
    }
    return $query;
}
}