Comments (3)
And here's the script comparing in reverse (all the keys in `stopword` that are not in `natural`):
const _ = require('lodash');
const sw = require('stopword');

/**
 * Reverse comparison: for every language, print the entries that appear in
 * the `stopword` package's list but not in the corresponding `natural` list.
 *
 * Each row is:
 *   [label printed before the result,
 *    function returning the `stopword` list for that language,
 *    module path of the matching `natural` stopword list]
 *
 * The `stopword` list is wrapped in a function and the `natural` module is
 * required inside the loop so that each language is only touched when its
 * row is processed, one after another, in the order listed here.
 */
const comparisons = [
  ['EN', () => sw.eng, 'natural/lib/natural/util/stopwords'],
  ['ES', () => sw.spa, 'natural/lib/natural/util/stopwords_es'],
  ['FA', () => sw.fas, 'natural/lib/natural/util/stopwords_fa'],
  ['FR', () => sw.fra, 'natural/lib/natural/util/stopwords_fr'],
  ['ID', () => sw.ind, 'natural/lib/natural/util/stopwords_id'],
  ['JA', () => sw.jpn, 'natural/lib/natural/util/stopwords_ja'],
  ['IT', () => sw.ita, 'natural/lib/natural/util/stopwords_it'],
  ['NL', () => sw.nld, 'natural/lib/natural/util/stopwords_nl'],
  ['NO', () => sw.nob, 'natural/lib/natural/util/stopwords_no'],
  ['PL', () => sw.pol, 'natural/lib/natural/util/stopwords_pl'],
  // Portuguese: both `stopword` lists are concatenated before comparing.
  ['PT', () => [...sw.por, ...sw.porBr], 'natural/lib/natural/util/stopwords_pt'],
  ['RU', () => sw.rus, 'natural/lib/natural/util/stopwords_ru'],
  ['SV', () => sw.swe, 'natural/lib/natural/util/stopwords_sv'],
  ['ZH', () => sw.zho, 'natural/lib/natural/util/stopwords_zh'],
];

for (const [label, readStopwordList, naturalModulePath] of comparisons) {
  // Compute the difference first, then print the label followed by the result.
  const onlyInStopword = _.difference(
    readStopwordList(),
    require(naturalModulePath).words
  );
  console.log(label);
  console.log(JSON.stringify(onlyInStopword, null, 2));
}
Output:
❯ node test
EN
[]
ES
[]
FA
[
"به",
"اکنون",
"اگر",
"اگرچه",
"الا",
"اما",
"اندر",
"اینکه",
"باری",
"بالعکس",
"بدون",
"بر",
"بلکه",
"بنابراین",
"بی",
"پس",
"تا",
"جز",
"چنانچه",
"چه",
"چون",
"در",
"را",
"روی",
"زیرا",
"سپس",
"غیر",
"که",
"لیکن",
"مانند",
"مثل",
"مگر",
"نه",
"نیز",
"هرچند",
"هم",
"همان",
"وانگهی",
"ولی",
"ولو",
"همانند",
"همچو"
]
FR
[]
ID
[
"rata",
"tahun",
"tengah",
"tinggi",
"umum",
"waktu"
]
JA
[]
IT
[]
NL
[]
NO
[]
PL
[]
PT
[
"adeus",
"agora",
"aí",
"ainda",
"além",
"algo",
"alguém",
"algum",
"alguma",
"algumas",
"alguns",
"ali",
"ampla",
"amplas",
"amplo",
"amplos",
"ano",
"anos",
"ante",
"antes",
"apenas",
"apoio",
"após",
"aqui",
"área",
"assim",
"atrás",
"através",
"baixo",
"bastante",
"bem",
"boa",
"boas",
"bom",
"bons",
"breve",
"cá",
"cada",
"catorze",
"cedo",
"cento",
"certamente",
"certeza",
"cima",
"cinco",
"coisa",
"coisas",
"conselho",
"contra",
"contudo",
"custa",
"dá",
"dão",
"daquela",
"daquelas",
"daquele",
"daqueles",
"dar",
"debaixo",
"demais",
"dentro",
"desde",
"dessa",
"dessas",
"desse",
"desses",
"desta",
"destas",
"deste",
"destes",
"deve",
"devem",
"devendo",
"dever",
"deverá",
"deverão",
"deveria",
"deveriam",
"devia",
"deviam",
"dez",
"dezanove",
"dezasseis",
"dezassete",
"dezoito",
"dia",
"diante",
"disse",
"disso",
"disto",
"dito",
"diz",
"dizem",
"dizer",
"dois",
"doze",
"duas",
"dúvida",
"é",
"embora",
"enquanto",
"era",
"eram",
"éramos",
"és",
"está",
"estamos",
"estão",
"estar",
"estás",
"estava",
"estavam",
"estávamos",
"esteja",
"estejam",
"estejamos",
"esteve",
"estive",
"estivemos",
"estiver",
"estivera",
"estiveram",
"estivéramos",
"estiverem",
"estivermos",
"estivesse",
"estivessem",
"estivéssemos",
"estiveste",
"estivestes",
"estou",
"etc",
"exemplo",
"faço",
"falta",
"favor",
"faz",
"fazeis",
"fazem",
"fazemos",
"fazendo",
"fazer",
"fazes",
"feita",
"feitas",
"feito",
"feitos",
"fez",
"fim",
"final",
"foi",
"fomos",
"for",
"fora",
"foram",
"fôramos",
"forem",
"forma",
"formos",
"fosse",
"fossem",
"fôssemos",
"foste",
"fostes",
"fui",
"geral",
"grande",
"grandes",
"grupo",
"há",
"haja",
"hajam",
"hajamos",
"hão",
"havemos",
"havia",
"hei",
"hoje",
"hora",
"horas",
"houve",
"houvemos",
"houver",
"houvera",
"houverá",
"houveram",
"houvéramos",
"houverão",
"houverei",
"houverem",
"houveremos",
"houveria",
"houveriam",
"houveríamos",
"houvermos",
"houvesse",
"houvessem",
"houvéssemos",
"la",
"lá",
"lado",
"lo",
"local",
"logo",
"longe",
"lugar",
"maior",
"maioria",
"mal",
"máximo",
"meio",
"menor",
"menos",
"mês",
"meses",
"mesma",
"mesmas",
"mesmos",
"mil",
"momento",
"muita",
"muitas",
"nada",
"naquela",
"naquelas",
"naquele",
"naqueles",
"nenhum",
"nenhuma",
"nessa",
"nessas",
"nesse",
"nesses",
"nesta",
"nestas",
"neste",
"nestes",
"ninguém",
"nível",
"noite",
"nome",
"nova",
"novas",
"nove",
"novo",
"novos",
"número",
"nunca",
"obra",
"obrigada",
"obrigado",
"oitava",
"oitavo",
"oito",
"onde",
"ontem",
"onze",
"outra",
"outras",
"outro",
"outros",
"parece",
"parte",
"partir",
"paucas",
"pequena",
"pequenas",
"pequeno",
"pequenos",
"per",
"perante",
"perto",
"pode",
"pude",
"pôde",
"podem",
"podendo",
"poder",
"poderia",
"poderiam",
"podia",
"podiam",
"põe",
"põem",
"pois",
"ponto",
"pontos",
"porém",
"porque",
"porquê",
"posição",
"possível",
"possivelmente",
"posso",
"pouca",
"poucas",
"pouco",
"poucos",
"primeira",
"primeiras",
"primeiro",
"primeiros",
"própria",
"próprias",
"próprio",
"próprios",
"próxima",
"próximas",
"próximo",
"próximos",
"pude",
"puderam",
"quáis",
"quanto",
"quantos",
"quarta",
"quarto",
"quatro",
"quê",
"quer",
"quereis",
"querem",
"queremas",
"queres",
"quero",
"questão",
"quinta",
"quinto",
"quinze",
"relação",
"sabe",
"sabem",
"são",
"segunda",
"segundo",
"sei",
"seis",
"seja",
"sejam",
"sejamos",
"sempre",
"sendo",
"ser",
"será",
"serão",
"serei",
"seremos",
"seria",
"seriam",
"seríamos",
"sete",
"sétima",
"sétimo",
"sexta",
"sexto",
"si",
"sido",
"sim",
"sistema",
"sob",
"sobre",
"sois",
"somos",
"sou",
"tal",
"talvez",
"tampouco",
"tanta",
"tantas",
"tanto",
"tão",
"tarde",
"tem",
"tém",
"têm",
"temos",
"tendes",
"tendo",
"tenha",
"tenham",
"tenhamos",
"tenho",
"tens",
"ter",
"terá",
"terão",
"terceira",
"terceiro",
"terei",
"teremos",
"teria",
"teriam",
"teríamos",
"teve",
"ti",
"tido",
"tinha",
"tinham",
"tínhamos",
"tive",
"tivemos",
"tiver",
"tivera",
"tiveram",
"tivéramos",
"tiverem",
"tivermos",
"tivesse",
"tivessem",
"tivéssemos",
"tiveste",
"tivestes",
"toda",
"todas",
"todavia",
"todo",
"todos",
"trabalho",
"três",
"treze",
"tudo",
"última",
"últimas",
"último",
"últimos",
"uns",
"vai",
"vais",
"vão",
"vários",
"vem",
"vêm",
"vendo",
"vens",
"ver",
"vez",
"vezes",
"viagem",
"vindo",
"vinte",
"vir",
"vós",
"vossa",
"vossas",
"zero"
]
RU
[
"не",
"со",
"то",
"она",
"да",
"же",
"вы",
"бы",
"по",
"ее",
"мне",
"было",
"вот",
"меня",
"еще",
"нет",
"ему",
"теперь",
"когда",
"даже",
"ну",
"ли",
"уже",
"ни",
"него",
"вас",
"нибудь",
"уж",
"вам",
"ведь",
"потом",
"себя",
"ничего",
"ей",
"может",
"тут",
"есть",
"надо",
"ней",
"тебя",
"чем",
"была",
"сам",
"чтоб",
"без",
"будто",
"чего",
"раз",
"себе",
"будет",
"тогда",
"того",
"этого",
"какой",
"совсем",
"ним",
"этом",
"почти",
"тем",
"чтобы",
"нее",
"были",
"куда",
"всех",
"сегодня",
"можно",
"при",
"об",
"хоть",
"больше",
"эти",
"нас",
"про",
"всего",
"них",
"какая",
"разве",
"эту",
"моя",
"свою",
"этой",
"перед",
"иногда",
"лучше",
"чуть",
"том",
"нельзя",
"такой",
"им",
"всегда",
"конечно",
"всю",
"это",
"лишь"
]
SV
[]
ZH
[]
from natural.
I found these differences because the word "hello" is in Natural's "id" stopwords list but not in the stopword.ind list.
❯ node
Welcome to Node.js v16.15.1.
Type ".help" for more information.
> require('stopword').ind.indexOf('hello')
-1
> require('natural/lib/natural/util/stopwords_id').words.indexOf('hello')
267
from natural.
Thanks for your comparison. I think it is a matter of taste and application which words are considered stopwords. For now I will leave it as it is. Maybe in the future we will re-use the stopwords package.
from natural.
Related Issues (20)
- Bug report in SequenceTokenizerNew HOT 4
- Jasmine tests for TypeScript definitions
- Unneeded `tsc` dependency HOT 4
- SentenceTokenizer works badly with numbered lists HOT 1
- Build errors trying to use natural with create react app (CRA) react v 18.2.0, natural v6.2.0 HOT 2
- Save data in Mongodb HOT 10
- Bayes Classifier Restore Issue
- Release updated type declarations HOT 1
- Trying to import natural in TypeScript yields a linter error
- Missing typings for LevenshteinDistanceSearch as well as for DamerauLevenshteinDistanceSearch HOT 2
- Sanity check - client side only possible? HOT 2
- Version 6.5.0 introduces a breaking change for sentence tokenizing HOT 2
- Error when using natural from deno using esm.sh HOT 1
- Enhancement: Stricter Typing for tag Field in BrillPOSTaggedWord Interface HOT 1
- Error in `natural.JaroWinklerDistance()` when missing optional `options` parameter
- natural.AggressiveTokenizerRu() doesn't filter out non-cyrillic characters HOT 1
- Bulk mode that is mindful of event loop HOT 2
- Check for Eszett symbol ß in German tokenizer HOT 1
- Noob Question. Challenge to devs? Help?
- Typo in docs HOT 1
Recommend Projects
-
React
A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
Typescript
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow
An Open Source Machine Learning Framework for Everyone
-
Django
The Web framework for perfectionists with deadlines.
-
Laravel
A PHP framework for web artisans
-
D3
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
-
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Some thing interesting about web. New door for the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Some thing interesting about visualization, use data art
-
Game
Some thing interesting about game, make everyone happy.
Recommend Org
-
Facebook
We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft
Open source projects and samples from Microsoft.
-
Google
Google ❤️ Open Source for everyone.
-
Alibaba
Alibaba Open Source for everyone
-
D3
Data-Driven Documents codes.
-
Tencent
China tencent open source team.
from natural.