Giter Club home page Giter Club logo

tha's Introduction

Tha (ថអ)

Khmer Text Normalization and Verbalization Toolkit.

Install

pip install tha
import tha.normalize
import tha.phone_numbers
import tha.urls
import tha.datetime
import tha.hashtags
import tha.ascii_lines
import tha.license_plate
import tha.cardinals
import tha.decimals
import tha.ordinals
import tha.currency
import tha.parenthesis
import tha.repeater

## Normalize
assert tha.normalize.processor("αž˜αž·αž“\u200bαž²αŸ’αž™") == "αž˜αž·αž“αž±αŸ’αž™"

## Phone Numbers
# In these examples the leading "0" and the following two digits each form
# their own group; the remaining digits are split into groups of `chunk_size`
# digits, and all groups are joined with "▁".
assert tha.phone_numbers.processor("010123123", chunk_size=2) == "0▁10▁12▁31▁23"
assert tha.phone_numbers.processor("010123123", chunk_size=3) == "0▁10▁123▁123"
# A leftover digit that cannot fill a whole group is kept in the last group ("1234").
assert tha.phone_numbers.processor("0961231234", chunk_size=3) == "0▁96▁123▁1234"

## URLs and emails
# E-mail address: "@" is verbalized as "at" and "." as "dot".
assert tha.urls.processor("example@gmail.com") == "example at g▁mail dot com"
# A leading http:// or https:// scheme is dropped from the output.
assert tha.urls.processor("https://google.com") == "google dot com"
assert tha.urls.processor("http://google.com") == "google dot com"
assert tha.urls.processor("google.com") == "google dot com"
# The "kh" label is emitted letter by letter, separated by "▁".
assert tha.urls.processor("google.gov.kh") == "google dot gov dot k▁h"
assert tha.urls.processor("google.com.kh") == "google dot com dot k▁h"

## Time
assert tha.datetime.time_processor("10:23AM") == "10 23▁A▁M"
assert tha.datetime.time_processor("10:23PM") == "10 23▁P▁M"
assert tha.datetime.time_processor("1:23PM") == "1 23▁P▁M"

## Date
assert tha.datetime.date_processor("2024-01-02") == "2024 01 02"
assert tha.datetime.date_processor("01-02-2034") == "01 02 2034"

## Hashtags
assert (
  tha.hashtags.processor("Hello world #this_will_remove hello") == "Hello world  hello"
)
assert tha.hashtags.processor("Hello world #αž›αž»αž” hello") == "Hello world  hello"
assert tha.hashtags.processor("Hello world #αž›αž»αž”1234 hello") == "Hello world  hello"

## ASCII Lines
assert tha.ascii_lines.processor("Remove --- asdasd") == "Remove  asdasd"
assert tha.ascii_lines.processor("Remove\n###\nasdasd") == "Remove\n\nasdasd"

## Cambodia License Plate
assert tha.license_plate.processor("1A 1234") == "1 A 12▁34"
assert tha.license_plate.processor("1A 4444") == "1 A αž€αžΆαžšαŸ‰αŸ4"

## Number - Cardinals
assert tha.cardinals.processor("1234") == "αž˜αž½αž™αž–αžΆαž“αŸ‹β–αž–αžΈαžšαžšαž™β–αžŸαžΆαž˜αžŸαž·αž”αž”αž½αž“"
assert tha.cardinals.processor("1") == "αž˜αž½αž™"
assert tha.cardinals.processor("1▁2") == "αž˜αž½αž™β–αž–αžΈαžš"
assert tha.cardinals.processor("-1") == "αžŠαž€β–αž˜αž½αž™"
assert tha.cardinals.processor("10") == "αžŠαž”αŸ‹"
assert tha.cardinals.processor("15") == "αžŠαž”αŸ‹αž”αŸ’αžšαžΆαŸ†"
assert tha.cardinals.processor("100") == "αž˜αž½αž™αžšαž™"
assert tha.cardinals.processor("10000") == "αž˜αž½αž™αž˜αŸ‰αžΊαž“"
assert tha.cardinals.processor("10000.234") == "αž˜αž½αž™αž˜αŸ‰αžΊαž“.αž–αžΈαžšαžšαž™β–αžŸαžΆαž˜αžŸαž·αž”αž”αž½αž“"
assert tha.cardinals.processor("-10000.234") == "αžŠαž€β–αž˜αž½αž™αž˜αŸ‰αžΊαž“.αž–αžΈαžšαžšαž™β–αžŸαžΆαž˜αžŸαž·αž”αž”αž½αž“"
assert tha.cardinals.processor("-10000,234") == "αžŠαž€β–αž˜αž½αž™αž˜αŸ‰αžΊαž“,αž–αžΈαžšαžšαž™β–αžŸαžΆαž˜αžŸαž·αž”αž”αž½αž“"

## Number - Decimals
assert tha.decimals.processor("123.324") == "αž˜αž½αž™αžšαž™β–αž˜αŸ’αž—αŸƒαž”αžΈβ–αž…αž»αž…β–αž”αžΈαžšαž™β–αž˜αŸ’αž—αŸƒαž”αž½αž“"
assert tha.decimals.processor("123.001") == "αž˜αž½αž™αžšαž™β–αž˜αŸ’αž—αŸƒαž”αžΈβ–αž…αž»αž…β–αžŸαžΌαž“αŸ’αž™β–αžŸαžΌαž“αŸ’αž™β–αž˜αž½αž™"
assert tha.decimals.processor("-123.0012") == "αžŠαž€β–αž˜αž½αž™αžšαž™β–αž˜αŸ’αž—αŸƒαž”αžΈβ–αž…αž»αž…β–αžŸαžΌαž“αŸ’αž™β–αžŸαžΌαž“αŸ’αž™β–αžŠαž”αŸ‹αž–αžΈαžš"
assert tha.decimals.processor("-123,0012") == "αžŠαž€β–αž˜αž½αž™αžšαž™β–αž˜αŸ’αž—αŸƒαž”αžΈβ–αž€αŸ’αž”αŸ€αžŸβ–αžŸαžΌαž“αŸ’αž™β–αžŸαžΌαž“αŸ’αž™β–αžŠαž”αŸ‹αž–αžΈαžš"

## Number - Ordinals
assert tha.ordinals.processor("5th") == "αž‘αžΈβ–αž”αŸ’αžšαžΆαŸ†"
assert tha.ordinals.processor("3rd") == "αž‘αžΈβ–αž”αžΈ"
assert tha.ordinals.processor("1st") == "αž‘αžΈβ–αž˜αž½αž™"
assert tha.ordinals.processor("10th") == "αž‘αžΈβ–αžŠαž”αŸ‹"
assert tha.ordinals.processor("10") == "10"

## Number - Currency
assert tha.currency.processor("$100.01") == "αž˜αž½αž™αžšαž™αžŠαž»αž›αŸ’αž›αžΆαžšβ–αž˜αž½αž™αžŸαŸαž“"
assert tha.currency.processor("$100") == "αž˜αž½αž™αžšαž™β–αžŠαž»αž›αŸ’αž›αžΆαžš"
assert tha.currency.processor("100$") == "αž˜αž½αž™αžšαž™αžŠαž»αž›αŸ’αž›αžΆαžš"
assert tha.currency.processor("100αŸ›") == "αž˜αž½αž™αžšαž™αžšαŸ€αž›"
assert tha.currency.processor("100.32αŸ›") == "αž˜αž½αž™αžšαž™β–αž…αž»αž…β–αžŸαžΆαž˜αžŸαž·αž”αž–αžΈαžšαžšαŸ€αž›"
assert tha.currency.processor("100.0032αŸ›") == "αž˜αž½αž™αžšαž™β–αž…αž»αž…β–αžŸαžΌαž“αŸ’αž™β–αžŸαžΌαž“αŸ’αž™β–αžŸαžΆαž˜αžŸαž·αž”αž–αžΈαžšαžšαŸ€αž›"

## Parenthesis
assert tha.parenthesis.processor("Hello (this will be ignored) world") == "Hello world"


## Iteration Mark
def fake_tokenizer(_):
  """Return a fixed list of five tokens, ignoring the input text.

  Stand-in tokenizer for the iteration-mark example: the argument is
  discarded and a new list with the same tokens is built on every call.
  """
  # NOTE(review): these literals look like mis-decoded UTF-8 (presumably
  # Khmer) -- confirm against the upstream README before relying on them.
  tokens = [
    "αž‚αžΆαžαŸ‹",
    "αž”αžΆαž“",
    "αž‘αŸ…",
    "αž”αž“αŸ’αžαž·αž…",
    "αž˜αŸ’αžŠαž„",
  ]
  return tokens


assert (
  tha.repeater.processor("αž‚αžΆαžαŸ‹αž”αžΆαž“αž‘αŸ…αž”αž“αŸ’αžαž·αž…αž˜αŸ’αžŠαž„αŸ—αž αžΎαž™", tokenizer=fake_tokenizer)
  == "αž‚αžΆαžαŸ‹αž”αžΆαž“αž‘αŸ…αž”αž“αŸ’αžαž·αž…αž˜αŸ’αžŠαž„β–αž”αž“αŸ’αžαž·αž…αž˜αŸ’αžŠαž„αž αžΎαž™"
)

tha's People

Contributors

seanghay avatar

Stargazers

xyber avatar Sila Rim avatar  avatar Sliden avatar Vc3nt avatar Khiev Boraty avatar RinYato avatar

Watchers

 avatar

Forkers

awesome-service

tha's Issues

Installation issue on Windows with Python 3.12

I have an installation issue on Windows with Python 3.12 when running `pip install tha`.

image

I think the error occurs because the README contains UTF-8 characters; opening it with an explicit encoding should fix it:

with open("README.md", encoding="utf-8") as f:
  long_description = f.read()

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Something interesting about the web. A new door to the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Something interesting about games, to make everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google ❤️ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.