(feat): add text normalization for Thai and emoji #1130
(feat): add text normalization for Thai and emoji #1130 — triphopp wants to merge 1 commit into PyThaiNLP:dev from (source branch not captured)
Conversation
|
|
Thank you. Currently our text normalization functions are in a dedicated module (the link to it was lost when this page was extracted). If you like, you can try to fit the new functions into that file structure. |
Thank you for pointing that out. |
|
|
||
| """ | ||
|
|
||
| thai_special_chars_unicode = { |
There was a problem hiding this comment.
You can also try to use the list defined here:
pythainlp/pythainlp/__init__.py
Lines 20 to 22 in a069230
(it doesn't include ฯลฯ though)
# Emoji grouped by the sentiment they usually convey.
# The keys ("positive", "negative", "neutral") are the category names that
# replace_emoji_with_sentiment() looks up; each value lists the emoji that
# belong to that category. Some entries (e.g. the heart and the smiling face)
# are two code points: the base symbol followed by U+FE0F.
emoji_sentiment = {
    "positive": [
        "😊", "😁", "😂", "🤣", "😄", "😍", "😘", "😻", "👍",
        "👏", "💕", "❤️", "😇", "😎", "🥰", "😃", "☺️",
    ],
    "negative": [
        "😢", "😭", "😠", "😡", "😤", "👎", "💔", "😞",
        "😖", "😩", "😣", "😫", "😓", "😰", "😱", "😿",
    ],
    "neutral": [
        "😐", "😶", "🤔", "😑", "😬", "😴",
        "😕", "😒", "🙄", "😮", "🤨", "😲",
    ],
}
|
|
||
def replace_emoji_with_sentiment(sentence: str, emoji_dict: dict) -> str:
    """Replace every known emoji in *sentence* with a sentiment tag.

    Emoji listed under ``emoji_dict["positive"]`` become ``" <<EMO_POS>> "``,
    those under ``"negative"`` become ``" <<EMO_NEG>> "`` and those under
    ``"neutral"`` become ``" <<EMO_NEU>> "``. Text that matches no listed
    emoji is returned unchanged.

    Args:
        sentence: Text that may contain emoji.
        emoji_dict: Mapping from category name (``"positive"``,
            ``"negative"``, ``"neutral"``) to an iterable of emoji strings.
            A missing category is treated as empty.

    Returns:
        The sentence with each listed emoji replaced by its tag.
    """
    import re  # local import: the surrounding file shows no import section

    # Build emoji -> tag. setdefault keeps the first category that lists an
    # emoji, matching the positive -> negative -> neutral replacement order.
    tag_by_emoji = {}
    for category, tag in (
        ("positive", " <<EMO_POS>> "),
        ("negative", " <<EMO_NEG>> "),
        ("neutral", " <<EMO_NEU>> "),
    ):
        for emo in emoji_dict.get(category, ()):
            if emo:  # an empty string would match between every character
                tag_by_emoji.setdefault(emo, tag)

    if not sentence or not tag_by_emoji:
        return sentence

    # Longest alternatives first, so a sequence such as "symbol + U+FE0F" is
    # matched as a whole before its bare base symbol can claim a prefix.
    alternatives = sorted(tag_by_emoji, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(emo) for emo in alternatives))

    # One scan of the sentence instead of one str.replace() pass per emoji.
    return pattern.sub(lambda match: tag_by_emoji[match.group(0)], sentence)
There was a problem hiding this comment.
# Emoji grouped by sentiment category; the keys are the category names that
# replace_emoji_with_sentiment() below reads from its ``emoji_dict`` argument.
emoji_sentiment = {
    "positive": [
        "😊", "😁", "😂", "🤣", "😄", "😍", "😘", "😻", "👍",
        "👏", "💕", "❤️", "😇", "😎", "🥰", "😃", "☺️",
    ],
    "negative": [
        "😢", "😭", "😠", "😡", "😤", "👎", "💔", "😞",
        "😖", "😩", "😣", "😫", "😓", "😰", "😱", "😿",
    ],
    "neutral": [
        "😐", "😶", "🤔", "😑", "😬", "😴",
        "😕", "😒", "🙄", "😮", "🤨", "😲",
    ],
}


def replace_emoji_with_sentiment(sentence: str, emoji_dict: dict) -> str:
    """Replace each listed emoji in *sentence* with its sentiment tag.

    Categories are processed in the order positive, negative, neutral, and
    every emoji of a category is replaced by that category's tag
    (``" <<EMO_POS>> "``, ``" <<EMO_NEG>> "`` or ``" <<EMO_NEU>> "``).

    Args:
        sentence: Text that may contain emoji.
        emoji_dict: Mapping with the keys ``"positive"``, ``"negative"`` and
            ``"neutral"``, each holding an iterable of emoji strings.

    Returns:
        The sentence with the listed emoji replaced by tags.

    Raises:
        KeyError: If one of the three category keys is missing.
    """
    # One (category, tag) table instead of three copy-pasted loops; the
    # replacement order is unchanged.
    for category, tag in (
        ("positive", " <<EMO_POS>> "),
        ("negative", " <<EMO_NEG>> "),
        ("neutral", " <<EMO_NEU>> "),
    ):
        for emo in emoji_dict[category]:
            sentence = sentence.replace(emo, tag)
    return sentence
import re

# Emoji grouped by sentiment. Each key ("POS", "NEG", "NEU") is inserted
# verbatim into the replacement tag, e.g. "POS" -> " <<EMO_POS>> ".
emoji_sentiment = {
    "POS": [
        "😊", "😁", "😂", "🤣", "😄", "😍", "😘", "😻", "👍",
        "👏", "💕", "❤️", "😇", "😎", "🥰", "😃", "☺️",
    ],
    "NEG": [
        "😢", "😭", "😠", "😡", "😤", "👎", "💔", "😞",
        "😖", "😩", "😣", "😫", "😓", "😰", "😱", "😿",
    ],
    "NEU": [
        "😐", "😶", "🤔", "😑", "😬", "😴",
        "😕", "😒", "🙄", "😮", "🤨", "😲",
    ],
}

# Create an emoji-sentiment map from `emoji_sentiment`.
# Alternatively, we can have the static map predefined.
emoji_to_tag = {
    emo: f" <<EMO_{sentiment}>> "
    for sentiment, emojis in emoji_sentiment.items()
    for emo in emojis
}

# One alternation over every known emoji, compiled once at import time.
# Longest alternatives come first so that a two-code-point emoji (base symbol
# + U+FE0F) is matched whole rather than by a shorter prefix.
# NOTE: the pattern is a snapshot of `emoji_to_tag`; rebuild it if the map is
# modified after import.
_EMOJI_PATTERN = re.compile(
    "|".join(
        re.escape(emo)
        for emo in sorted(emoji_to_tag, key=len, reverse=True)
    )
)


def replace_emoji_with_sentiment(sentence: str) -> str:
    """Replace every emoji found in ``emoji_to_tag`` with its sentiment tag.

    The sentence is scanned once; each match is looked up in ``emoji_to_tag``
    and replaced by the corresponding ``" <<EMO_...>> "`` tag.

    Args:
        sentence: Text that may contain emoji.

    Returns:
        The sentence with known emoji replaced by tags; other text is
        returned unchanged.
    """
    return _EMOJI_PATTERN.sub(
        lambda match: emoji_to_tag[match.group(0)], sentence
    )
We can also try to reduce the number of text passes from three to one by having an emoji-sentiment map.
There was a problem hiding this comment.
Thanks! Would it be okay if I add this code to the text normalization function?
There was a problem hiding this comment.
Please take it and use it in any way you want :)
There was a problem hiding this comment.
Okay, I think this is the same as emojiconv.py
There was a problem hiding this comment.
@triphopMahithi Hello! Any update?
|
This PR is stale because it has been open for 30 days with no activity. |
|
This PR is stale because it has been open for 30 days with no activity. |
|
Please rebase, as we have updated a test. |



What does this change?
What was wrong
How this fixes it
Applies sentiment-aware emoji replacement based on a defined dictionary
Your checklist for this pull request