Server IP : 80.87.202.40 / Your IP : 216.73.216.169 Web Server : Apache System : Linux rospirotorg.ru 5.14.0-539.el9.x86_64 #1 SMP PREEMPT_DYNAMIC Thu Dec 5 22:26:13 UTC 2024 x86_64 User : bitrix ( 600) PHP Version : 8.2.27 Disable Function : NONE MySQL : OFF | cURL : ON | WGET : ON | Perl : ON | Python : OFF | Sudo : ON | Pkexec : ON Directory : /home/bitrix/ext_www/rospirotorg.ru/bitrix/modules/search/tools/ |
Upload File : |
<?php

/**
 * Keyboard-layout aware language model used to guess in which layout
 * a search phrase was typed.
 *
 * Characters are translated into "scancodes" (their position inside the
 * strings returned by GetKeyboardLayout()). Sequences of scancodes are then
 * scored against forbidden trigrams and bigram frequencies of a language.
 *
 * Language specific behaviour is expected to come from classes named
 * "CSearchLanguage<lang>" that are loaded by GetLanguage().
 */
class CSearchLanguage
{
    /** @var array Alphabet of the language: character => index (filled by GetLanguage()). */
    public $_abc = [];
    /** @var string Language identifier passed to the constructor. */
    public $_lang_id;
    /** @var array|null Memoized result of GetBigrammScancodeFreq(). */
    public $_lang_bigramm_cache;
    /** @var array Forbidden trigrams: binary scancode string => true. */
    public $_trigrams = [];
    /** @var bool|null Whether the object provides GetBigrammLetterFreq() (set by GetLanguage()). */
    public $_has_bigramm_info = null;
    public $_bigrams = null;

    /**
     * @param string $lang_id Language identifier.
     */
    public function __construct($lang_id)
    {
        $this->_lang_id = $lang_id;
    }

    /**
     * Loads (once per request) and returns the language object for $sLang.
     *
     * Lookup order: an already declared class, then a customized
     * php_interface/<lang>/search/language.php (which may return an object),
     * then the module's tools/<lang>/language.php, and finally this base class.
     *
     * @param string $sLang Language identifier.
     * @return CSearchLanguage
     */
    public static function GetLanguage($sLang)
    {
        static $arLanguages = [];

        if (isset($arLanguages[$sLang])) {
            return $arLanguages[$sLang];
        }

        $obLanguage = null;
        $class_name = mb_strtolower('CSearchLanguage' . $sLang);
        $moduleDirName = $_SERVER['DOCUMENT_ROOT'] . '/bitrix/modules/search/tools/' . $sLang;

        // Default location of the trigram file. Without this default the
        // variable stayed undefined whenever the language class had already
        // been declared, and LoadTrigrams() was pointed at "/trigram".
        $strDirName = $moduleDirName;

        if (!class_exists($class_name)) {
            //First try to load customized class
            $strDirName = $_SERVER['DOCUMENT_ROOT'] . BX_PERSONAL_ROOT . '/php_interface/' . $sLang . '/search';
            $strFileName = $strDirName . '/language.php';
            if (file_exists($strFileName)) {
                $obLanguage = @include $strFileName;
            }

            if (!is_object($obLanguage) && !class_exists($class_name)) {
                //Then module class
                $strDirName = $moduleDirName;
                $strFileName = $strDirName . '/language.php';
                if (file_exists($strFileName)) {
                    if (\Bitrix\Main\Localization\Translation::allowConvertEncoding()) {
                        \Bitrix\Main\Localization\StreamConverter::include($strFileName, $sLang);
                    } else {
                        @include $strFileName;
                    }
                }

                //Nothing language specific was found: use the base class
                if (!class_exists($class_name)) {
                    $class_name = 'CSearchLanguage';
                }
            }
        }

        if (!is_object($obLanguage)) {
            $obLanguage = new $class_name($sLang);
        }

        $obLanguage->LoadTrigrams($strDirName);

        //The alphabet comes from the stemmer description of the language
        $arStemInfo = stemming_init($sLang);
        if (is_array($arStemInfo)) {
            $obLanguage->_abc = array_flip($obLanguage->StrToArray($arStemInfo['abc']));
        }

        $obLanguage->_has_bigramm_info = is_callable([$obLanguage, 'getbigrammletterfreq']);

        $arLanguages[$sLang] = $obLanguage;

        return $arLanguages[$sLang];
    }

    /**
     * Reads the file with trigrams (combinations not allowed in the words)
     * from "<dir_name>/trigram" and keeps them as binary scancode strings.
     * The parsed result is stored through CPHPCache.
     *
     * @param string $dir_name Directory which may contain the "trigram" file.
     * @return void
     */
    public function LoadTrigrams($dir_name)
    {
        if (!empty($this->_trigrams)) {
            return;
        }

        $file_name = $dir_name . '/trigram';
        if (!is_file($file_name)) {
            return;
        }

        //File modification time is a part of the key, so an edited file is parsed again
        $cache_id = filemtime($file_name) . ',v1,' . $file_name;
        $obCache = new CPHPCache;
        if (!$obCache->StartDataCache(360000, $cache_id, 'search')) {
            $this->_trigrams = $obCache->GetVars();
            return;
        }

        $text = file_get_contents($file_name);
        $keyboard = $this->GetKeyboardLayout();
        if (isset($keyboard['trigram_charset'])) {
            $text = \Bitrix\Main\Text\Encoding::convertEncoding($text, $keyboard['trigram_charset'], 'utf8');
        }

        foreach (explode("\n", $text) as $trigramm) {
            if (mb_strlen($trigramm) != 3) {
                continue;
            }
            //Keep only trigrams where all three characters are on the keyboard
            $strScanCodesTmp = $this->ConvertToScancode($trigramm, false, true);
            if (mb_strlen($strScanCodesTmp) == 3) {
                $this->_trigrams[$strScanCodesTmp] = true;
            }
        }

        $obCache->EndDataCache($this->_trigrams);
    }

    /**
     * @return bool True when at least one trigram is loaded.
     */
    public function HasTrigrams()
    {
        return !empty($this->_trigrams);
    }

    /**
     * Check phrase against trigrams.
     *
     * @param array $arScanCodes Scancodes; "false" marks a word boundary.
     * @return int Number of three-code windows found among the loaded trigrams.
     */
    public function CheckTrigrams($arScanCodes)
    {
        $result = 0;
        $window = '';

        foreach ($arScanCodes as $code) {
            if ($code === false) {
                //new word starts here
                $window = '';
                continue;
            }

            //running window of 3 bytes: keep the last two and append the new one
            $window = substr($window, -2) . chr($code + 1);
            if (strlen($window) === 3 && isset($this->_trigrams[$window])) {
                $result++;
            }
        }

        return $result;
    }

    /**
     * This function returns positions of the letters on the keyboard.
     * This one is default English layout.
     *
     * NOTE(review): as written, 'lo' is 37 characters long and 'hi' is 35,
     * so the same key does not share one position in both strings. This looks
     * like collapsed whitespace in the first-row literals - confirm against
     * the original file before relying on the positions.
     *
     * @return array Keys "lo" (unshifted characters) and "hi" (shifted characters).
     */
    public function GetKeyboardLayout()
    {
        return [
            'lo' => '` - '
                . 'qwertyuiop[]'
                . "asdfghjkl;'"
                . 'zxcvbnm,. ',
            'hi' => '~ '
                . 'QWERTYUIOP{}'
                . 'ASDFGHJKL:"'
                . 'ZXCVBNM<> ',
        ];
    }

    /**
     * Converts scancodes back into characters of the "lo" keyboard row set.
     *
     * @param array $arScancode Scancodes as produced by ConvertToScancode().
     * @return string
     */
    public function ConvertFromScancode($arScancode)
    {
        $keyboard = $this->GetKeyboardLayout();
        $result = '';

        foreach ($arScancode as $code) {
            if ($code === false) {
                //"false" is not a position: it marks a word break or an unknown
                //character. It used to be coerced to offset 0 and produced
                //the first key of the layout instead.
                $result .= ' ';
                continue;
            }
            $result .= mb_substr($keyboard['lo'], $code, 1);
        }

        return $result;
    }

    /**
     * Splits a string into an array of (multibyte) characters.
     *
     * @param string $str
     * @return array
     */
    public static function StrToArray($str)
    {
        $chars = [];
        $length = mb_strlen($str);
        for ($pos = 0; $pos < $length; $pos++) {
            $chars[] = mb_substr($str, $pos, 1);
        }

        return $chars;
    }

    /**
     * This function converts text between layouts.
     *
     * @param string $text Text to convert.
     * @param string $from Language identifier of the layout the text was typed in.
     * @param string $to Language identifier of the target layout.
     * @return string Converted text, or $text itself when a layout is not defined.
     */
    public static function ConvertKeyboardLayout($text, $from, $to)
    {
        static $keyboards = [];

        $combo = $from . '|' . $to;

        if (!isset($keyboards[$combo])) {
            //Fill local cache
            foreach ([$from, $to] as $lang_id) {
                if (array_key_exists($lang_id, $keyboards)) {
                    continue;
                }

                $ob = CSearchLanguage::GetLanguage($lang_id);
                $keyboard = $ob->GetKeyboardLayout();
                $keyboards[$lang_id] = is_array($keyboard)
                    ? array_merge($ob->StrToArray($keyboard['lo']), $ob->StrToArray($keyboard['hi']))
                    : null;
            }

            //when both layouts defined
            if (isset($keyboards[$from]) && isset($keyboards[$to])) {
                $keyboards[$combo] = [];
                foreach ($keyboards[$from] as $i => $ch) {
                    //Positions missing in the target layout are skipped, so such
                    //characters stay untouched and no undefined offset is read
                    if ($ch != false && isset($keyboards[$to][$i])) {
                        $keyboards[$combo][$ch] = $keyboards[$to][$i];
                    }
                }
            }
        }

        if (!isset($keyboards[$combo])) {
            return $text;
        }

        $map = $keyboards[$combo];
        $chars = static::StrToArray($text);
        foreach ($chars as $pos => $char) {
            if (isset($map[$char])) {
                $chars[$pos] = $map[$char];
            }
        }

        return implode('', $chars);
    }

    /**
     * This function converts text into array of character positions
     * on the keyboard. Not defined chars turns into "false" value.
     *
     * @param string $text Text to convert.
     * @param bool $strict When true, characters outside of the alphabet ($_abc) are not defined too.
     * @param bool $binary When true, a string of chr(position + 1) bytes is returned
     *                     and not defined characters are dropped.
     * @return array|string
     */
    public function ConvertToScancode($text, $strict = false, $binary = false)
    {
        static $cache = [];

        $lang_id = $this->_lang_id;
        if (!isset($cache[$lang_id])) {
            $cache[$lang_id] = [];
            $keyboard = $this->GetKeyboardLayout();
            //"hi" goes second, so it wins for characters present in both strings
            foreach (['lo', 'hi'] as $row) {
                foreach ($this->StrToArray($keyboard[$row]) as $pos => $ch) {
                    $cache[$lang_id][$ch] = $pos;
                }
            }
        }
        $scancodes = $cache[$lang_id];

        $result = $binary ? '' : [];
        foreach ($this->StrToArray($text) as $ch) {
            $defined = $ch !== ' '
                && (!$strict || isset($this->_abc[$ch]))
                && isset($scancodes[$ch]);

            if ($binary) {
                if ($defined) {
                    $result .= chr($scancodes[$ch] + 1);
                }
            } else {
                $result[] = $defined ? $scancodes[$ch] : false;
            }
        }

        return $result;
    }

    /**
     * Hook for language classes to make their own guess before the generic
     * detection runs.
     *
     * In subclasses you should return array("from" => lang, "to" => lang) to translate,
     * or return true when no translation is needed,
     * or false for further processing by GuessLanguage().
     *
     * @param string $text
     * @param string|bool $lang
     * @return array|bool
     */
    public function PreGuessLanguage($text, $lang = false)
    {
        //Indicates that there is no own guess
        return false;
    }

    /**
     * Guesses the keyboard layout the text was typed in and the layout
     * it was meant to be typed in.
     *
     * @param string $text Phrase to analyze.
     * @param array|bool $lang List of language identifiers to choose from;
     *                         any other value means the languages from CLanguage::GetList().
     * @return array|false array("from" => lang, "to" => lang), or false when
     *                     there is nothing to convert.
     */
    public static function GuessLanguage($text, $lang = false)
    {
        if ($text == '') {
            return false;
        }

        static $cache = [];
        if (empty($cache)) {
            $cache[] = 'en';//English is always in mind and on the first place
            $rsLanguages = CLanguage::GetList();
            while ($arLanguage = $rsLanguages->Fetch()) {
                if ($arLanguage['LID'] != 'en') {
                    $cache[] = $arLanguage['LID'];
                }
            }
        }

        $arLanguages = is_array($lang) ? $lang : $cache;
        if (count($arLanguages) < 2) {
            return false;
        }

        //Give customized languages a chance to guess
        foreach ($arLanguages as $lang_id) {
            $res = CSearchLanguage::GetLanguage($lang_id)->PreGuessLanguage($text, $lang_id);
            if (is_array($res)) {
                return $res;
            }
            if ($res === true) {
                return false;
            }
        }

        //Number of truthy scancodes (array_filter drops "false" and position 0)
        $countCodes = static function ($codes) {
            return count(array_filter($codes));
        };

        //First try to detect language which was used to type the phrase
        $max_len = 0;
        $languages_from = [];
        foreach ($arLanguages as $lang_id) {
            $codes = CSearchLanguage::GetLanguage($lang_id)->ConvertToScancode($text, true);
            $max_len = max($max_len, $countCodes($codes));
            $languages_from[$lang_id] = $codes;
        }

        if (empty($languages_from) || $max_len < 2) {
            return false;
        }

        //Only languages which recognized the most characters remain
        $languages_from = array_filter(
            $languages_from,
            static function ($codes) use ($max_len, $countCodes) {
                return $countCodes($codes) >= $max_len;
            }
        );
        uasort(
            $languages_from,
            static function ($a, $b) use ($countCodes) {
                return $countCodes($b) - $countCodes($a);
            }
        );

        //If more than one language is detected as input
        //try to get one with best trigram info
        $arDetectionFrom = [];
        $i = 0;
        foreach ($languages_from as $lang_id => $arScanCodes) {
            $ob = CSearchLanguage::GetLanguage($lang_id);
            //Calculate how far sequence of scan codes is from language model
            $deviation = $ob->GetDeviation($arScanCodes);

            $arDetectionFrom[$lang_id] = [
                $ob->HasTrigrams(),
                $ob->CheckTrigrams($arScanCodes),
                $deviation[1],
                intval($deviation[0] * 100),
                $i,
            ];
            $i++;
        }
        uasort($arDetectionFrom, ['CSearchLanguage', 'cmp']);

        //Now try the best to detect the language
        $arDetection = [];
        $i = 0;
        foreach ($arDetectionFrom as $lang_from => $arTemp) {
            foreach ($arLanguages as $lang_to) {
                $lang_from_to = $lang_from . '=>' . $lang_to;

                $ob = CSearchLanguage::GetLanguage($lang_to);
                $alt_text = CSearchLanguage::ConvertKeyboardLayout($text, $lang_from, $lang_to);
                $arScanCodes = $ob->ConvertToScancode($alt_text, true);
                //Calculate how far sequence of scan codes is from language model
                $deviation = $ob->GetDeviation($arScanCodes);

                //Criteria are compared left to right by cmp(), smaller wins
                $arDetection[$lang_from_to] = [
                    $ob->HasBigrammInfo() ? 0 : 1,
                    $ob->CheckTrigrams($arScanCodes),
                    -count(array_filter($arScanCodes)),
                    $deviation[1],
                    $deviation[0],
                    $i,
                    $lang_from_to,
                ];
                $i++;
            }
        }
        uasort($arDetection, ['CSearchLanguage', 'cmp']);

        reset($arDetection);
        $language_from_to = key($arDetection);
        list($language_from, $language_to) = explode('=>', $language_from_to);

        //Conversion which changes nothing is not a guess
        $alt_text = CSearchLanguage::ConvertKeyboardLayout($text, $language_from, $language_to);
        if ($alt_text === $text) {
            return false;
        }

        return ['from' => $language_from, 'to' => $language_to];
    }

    /**
     * Compare two results of text analysis element by element.
     *
     * @param array $a
     * @param array $b
     * @return int -1, 1 or 0 when all elements of $a equal those of $b.
     */
    public static function cmp($a, $b)
    {
        $size = count($a);
        for ($i = 0; $i < $size; $i++) {
            if ($a[$i] < $b[$i]) {
                return -1;
            }
            if ($a[$i] > $b[$i]) {
                return 1;
            }
        }

        return 0;//never happens
    }

    /**
     * Function returns distance of the text (sequence of scan codes)
     * from language model.
     *
     * @param array $arScanCodes
     * @return array [deviation, number of text bigrams absent in the language model]
     */
    public function GetDeviation($arScanCodes)
    {
        //This is language model
        $lang_bigrams = $this->GetBigrammScancodeFreq();
        $lang_count = $lang_bigrams['count'];
        unset($lang_bigrams['count']);

        //This is text model
        $text_bigrams = $this->ConvertToBigramms($arScanCodes);
        $count = $text_bigrams['count'];
        unset($text_bigrams['count']);

        $deviation = 0;
        $zeroes = 0;
        foreach ($text_bigrams as $key => $occurrences) {
            //Every occurrence of the bigram contributes separately
            for ($i = 0; $i < $occurrences; $i++) {
                if (isset($lang_bigrams[$key])) {
                    $deviation += abs(1 / $count - $lang_bigrams[$key] / $lang_count);
                } else {
                    $zeroes++;
                    $deviation += 1 / $count;
                }
            }
        }

        return [$deviation, $zeroes];
    }

    /**
     * Function returns bigramms of the text (array of scancodes).
     * For example "FAT RAT" will be array("FA", "AT", "RA", "AT").
     * This is model of the text.
     *
     * @param array $arScancodes
     * @return array "count" => total number of bigramms, "<code1> <code2>" => occurrences
     */
    public function ConvertToBigramms($arScancodes)
    {
        $result = ['count' => 0];

        $last = count($arScancodes) - 1;
        for ($i = 0; $i < $last; $i++) {
            $code1 = $arScancodes[$i];
            $code2 = $arScancodes[$i + 1];
            //Pairs across a word boundary are not bigramms
            if ($code1 === false || $code2 === false) {
                continue;
            }

            $key = $code1 . ' ' . $code2;
            $result['count']++;
            if (!isset($result[$key])) {
                $result[$key] = 0;
            }
            $result[$key]++;
        }

        return $result;
    }

    /**
     * @return bool|null Value detected by GetLanguage(); null when the object
     *                   was not created through GetLanguage().
     */
    public function HasBigrammInfo()
    {
        return $this->_has_bigramm_info;
    }

    /**
     * Function returns model of the language: bigram weights keyed by
     * "<position1> <position2>" plus their sum under the "count" key.
     *
     * @return array
     */
    public function GetBigrammScancodeFreq()
    {
        if (!$this->HasBigrammInfo()) {
            return ['count' => 1];
        }

        if (!isset($this->_lang_bigramm_cache)) {
            $bigramms = $this->GetBigrammLetterFreq();
            $keyboard = $this->GetKeyboardLayout();
            $keyboard_lo = $keyboard['lo'];
            $keyboard_hi = $keyboard['hi'];

            //Keyboard position of every letter is found once, not per pair
            $positions = [];
            foreach ($bigramms as $letter => $unused) {
                $pos = mb_strpos($keyboard_lo, $letter);
                if ($pos === false) {
                    $pos = mb_strpos($keyboard_hi, $letter);
                }
                $positions[$letter] = $pos;
            }
            $letters = array_keys($bigramms);

            $result = ['count' => 0];
            foreach ($bigramms as $letter1 => $row) {
                //Weights in the row follow the order of the letters
                foreach ($letters as $i => $letter2) {
                    $weight = $row[$i];
                    $result['count'] += $weight;
                    $result[$positions[$letter1] . ' ' . $positions[$letter2]] = $weight;
                }
            }

            $this->_lang_bigramm_cache = $result;
        }

        return $this->_lang_bigramm_cache;
    }
}