Subir archivos a 'utils/vadersentiment'
This commit is contained in:
parent
a2c6225973
commit
6fedceb429
2 changed files with 224 additions and 0 deletions
112
utils/vadersentiment/sentitext.php
Normal file
112
utils/vadersentiment/sentitext.php
Normal file
|
@ -0,0 +1,112 @@
|
|||
<?php
|
||||
/*
|
||||
Identify sentiment-relevant string-level properties of input text.
|
||||
*/
|
||||
const PUNC_LIST = [".", "!", "?", ",", ";", ":", "-", "'", "\"",
|
||||
"!!", "!!!", "??", "???", "?!?", "!?!", "?!?!", "!?!?"];
|
||||
class SentiText {
|
||||
|
||||
private $text = "";
|
||||
public $words_and_emoticons = null;
|
||||
public $is_cap_diff = null;
|
||||
|
||||
function __construct($text){
|
||||
//checking that is string
|
||||
//if (!isinstance(text, str)){
|
||||
// text = str(text.encode('utf-8'));
|
||||
//}
|
||||
$this->text = $text;
|
||||
$this->words_and_emoticons = $this->_words_and_emoticons();
|
||||
// doesn't separate words from\
|
||||
// adjacent punctuation (keeps emoticons & contractions)
|
||||
$this->is_cap_diff = $this->allcap_differential($this->words_and_emoticons);
|
||||
}
|
||||
|
||||
/*
|
||||
Remove all punctation from a string
|
||||
*/
|
||||
function strip_punctuation($string) {
|
||||
//$string = strtolower($string);
|
||||
return preg_replace("/[[:punct:]]+/", "", $string);
|
||||
}
|
||||
|
||||
function array_count_values_of($haystack, $needle) {
|
||||
if(!in_array($needle,$haystack)){
|
||||
return 0;
|
||||
}
|
||||
$counts = array_count_values($haystack);
|
||||
return $counts[$needle];
|
||||
}
|
||||
|
||||
/*
|
||||
Check whether just some words in the input are ALL CAPS
|
||||
|
||||
:param list words: The words to inspect
|
||||
:returns: `True` if some but not all items in `words` are ALL CAPS
|
||||
*/
|
||||
private function allcap_differential($words){
|
||||
|
||||
$is_different = false;
|
||||
$allcap_words = 0;
|
||||
foreach($words as $word){
|
||||
//ctype is affected by the local of the processor see manual for more details
|
||||
if(ctype_upper($word)){
|
||||
$allcap_words += 1;
|
||||
}
|
||||
}
|
||||
$cap_differential = count($words) - $allcap_words;
|
||||
if ($cap_differential > 0 && $cap_differential < count($words)){
|
||||
$is_different = true;
|
||||
}
|
||||
return $is_different;
|
||||
}
|
||||
|
||||
function _words_only(){
|
||||
$text_mod = $this->strip_punctuation($this->text);
|
||||
// removes punctuation (but loses emoticons & contractions)
|
||||
$words_only = preg_split('/\s+/',$text_mod);
|
||||
# get rid of empty items or single letter "words" like 'a' and 'I'
|
||||
$works_only = array_filter($words_only,function($word){ return strlen($word) > 1; });
|
||||
return $words_only;
|
||||
}
|
||||
|
||||
function _words_and_emoticons(){
|
||||
|
||||
$wes = preg_split('/\s+/',$this->text);
|
||||
|
||||
# get rid of residual empty items or single letter words
|
||||
$wes = array_filter($wes,function($word){ return strlen($word) > 1; });
|
||||
//Need to remap the indexes of the array
|
||||
$wes = array_values ($wes);
|
||||
$words_only = $this->_words_only();
|
||||
|
||||
foreach($words_only as $word){
|
||||
|
||||
foreach(PUNC_LIST as $punct){
|
||||
|
||||
//replace all punct + word combinations with word
|
||||
$pword = $punct .$word;
|
||||
|
||||
$x1 = $this->array_count_values_of($wes,$pword);
|
||||
while ($x1 > 0){
|
||||
$i = array_search($pword,$wes);
|
||||
unset($wes[$i]);
|
||||
array_splice($wes,$i,0,$word);
|
||||
$x1 = $this->array_count_values_of($wes,$pword);
|
||||
}
|
||||
//Do the same as above but word then punct
|
||||
$wordp = $word . $punct;
|
||||
$x2 = $this->array_count_values_of($wes, $wordp);
|
||||
while ($x2 > 0){
|
||||
$i = array_search($wordp, $wes);
|
||||
unset($wes[$i]);
|
||||
array_splice($wes,$i,0,$word);
|
||||
$x2 = $this->array_count_values_of($wes, $wordp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return $wes;
|
||||
}
|
||||
}
|
||||
?>
|
112
utils/vadersentiment/sentitext_cat.php
Normal file
112
utils/vadersentiment/sentitext_cat.php
Normal file
|
@ -0,0 +1,112 @@
|
|||
<?php
|
||||
/*
|
||||
Identify sentiment-relevant string-level properties of input text.
|
||||
*/
|
||||
const PUNC_LIST = [".", "!", "?", ",", ";", ":", "-", "'", "\"",
|
||||
"!!", "!!!", "??", "???", "?!?", "!?!", "?!?!", "!?!?"];
|
||||
class SentiText {
|
||||
|
||||
private $text = "";
|
||||
public $words_and_emoticons = null;
|
||||
public $is_cap_diff = null;
|
||||
|
||||
function __construct($text){
|
||||
//checking that is string
|
||||
//if (!isinstance(text, str)){
|
||||
// text = str(text.encode('utf-8'));
|
||||
//}
|
||||
$this->text = $text;
|
||||
$this->words_and_emoticons = $this->_words_and_emoticons();
|
||||
// doesn't separate words from\
|
||||
// adjacent punctuation (keeps emoticons & contractions)
|
||||
$this->is_cap_diff = $this->allcap_differential($this->words_and_emoticons);
|
||||
}
|
||||
|
||||
/*
|
||||
Remove all punctation from a string
|
||||
*/
|
||||
function strip_punctuation($string) {
|
||||
//$string = strtolower($string);
|
||||
return preg_replace("/[[:punct:]]+/", "", $string);
|
||||
}
|
||||
|
||||
function array_count_values_of($haystack, $needle) {
|
||||
if(!in_array($needle,$haystack)){
|
||||
return 0;
|
||||
}
|
||||
$counts = array_count_values($haystack);
|
||||
return $counts[$needle];
|
||||
}
|
||||
|
||||
/*
|
||||
Check whether just some words in the input are ALL CAPS
|
||||
|
||||
:param list words: The words to inspect
|
||||
:returns: `True` if some but not all items in `words` are ALL CAPS
|
||||
*/
|
||||
private function allcap_differential($words){
|
||||
|
||||
$is_different = false;
|
||||
$allcap_words = 0;
|
||||
foreach($words as $word){
|
||||
//ctype is affected by the local of the processor see manual for more details
|
||||
if(ctype_upper($word)){
|
||||
$allcap_words += 1;
|
||||
}
|
||||
}
|
||||
$cap_differential = count($words) - $allcap_words;
|
||||
if ($cap_differential > 0 && $cap_differential < count($words)){
|
||||
$is_different = true;
|
||||
}
|
||||
return $is_different;
|
||||
}
|
||||
|
||||
function _words_only(){
|
||||
$text_mod = $this->strip_punctuation($this->text);
|
||||
// removes punctuation (but loses emoticons & contractions)
|
||||
$words_only = preg_split('/\s+/',$text_mod);
|
||||
# get rid of empty items or single letter "words" like 'a' and 'I'
|
||||
$works_only = array_filter($words_only,function($word){ return strlen($word) > 1; });
|
||||
return $words_only;
|
||||
}
|
||||
|
||||
function _words_and_emoticons(){
|
||||
|
||||
$wes = preg_split('/\s+/',$this->text);
|
||||
|
||||
# get rid of residual empty items or single letter words
|
||||
$wes = array_filter($wes,function($word){ return strlen($word) > 1; });
|
||||
//Need to remap the indexes of the array
|
||||
$wes = array_values ($wes);
|
||||
$words_only = $this->_words_only();
|
||||
|
||||
foreach($words_only as $word){
|
||||
|
||||
foreach(PUNC_LIST as $punct){
|
||||
|
||||
//replace all punct + word combinations with word
|
||||
$pword = $punct .$word;
|
||||
|
||||
$x1 = $this->array_count_values_of($wes,$pword);
|
||||
while ($x1 > 0){
|
||||
$i = array_search($pword,$wes);
|
||||
unset($wes[$i]);
|
||||
array_splice($wes,$i,0,$word);
|
||||
$x1 = $this->array_count_values_of($wes,$pword);
|
||||
}
|
||||
//Do the same as above but word then punct
|
||||
$wordp = $word . $punct;
|
||||
$x2 = $this->array_count_values_of($wes, $wordp);
|
||||
while ($x2 > 0){
|
||||
$i = array_search($wordp, $wes);
|
||||
unset($wes[$i]);
|
||||
array_splice($wes,$i,0,$word);
|
||||
$x2 = $this->array_count_values_of($wes, $wordp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return $wes;
|
||||
}
|
||||
}
|
||||
?>
|
Loading…
Reference in a new issue