Giter Club home page Giter Club logo

document-analysis2's Introduction

To build and run

mvn package

rem Input/output locations used by the commands further down in this README.
rem cmd keeps only the last value assigned to a variable, so the repeated
rem RAW_PATH and PARSED_PATH lines are alternatives: run just the one you need
rem (if all lines are pasted as-is, the last assignment of each variable wins).
set RAW_PATH=C:\Users\Mathias\vagrant\hadoop\document-analysis\input\03_A_ranger
set RAW_PATH=C:\Users\Mathias\vagrant\hadoop\document-analysis\input\test
set RAW_PATH=Z:\AMINA\03_A_ranger
set AVRO_PATH=C:\Users\Mathias\vagrant\hadoop\document-analysis\output\avro
set PARSED_PATH=C:\Users\Mathias\vagrant\hadoop\document-analysis\output\parsed
set PARSED_PATH=C:\Users\Mathias\vagrant\hadoop\document-analysis\output\parsed_simple
set PARSED_PATH=C:\Users\Mathias\vagrant\hadoop\document-analysis\output\parsed_simple2
set PARSED_PATH=C:\Users\Mathias\vagrant\hadoop\document-analysis\output\parsed_simple3
set DECRYPTED_PATH=C:\Users\Mathias\vagrant\hadoop\document-analysis\output\decrypted
set WORDCOUNT_PATH=C:\Users\Mathias\vagrant\hadoop\document-analysis\output\wordcount
set CLUSTERING_PATH=C:\Users\Mathias\vagrant\hadoop\document-analysis\output\clustering
rem NOTE(review): the model path below is the output directory itself and ends
rem with a backslash - confirm this is the intended location for the model.
set CLUSTERING_MODEL_PATH=C:\Users\Mathias\vagrant\hadoop\document-analysis\output\
set LANG_PATH=C:\Users\Mathias\vagrant\hadoop\document-analysis\output\lang
set TESSERACT_PATH=C:\Users\Mathias\Work\Tools\tessdata-master
rem NOTE(review): TESSERACT_LANG is not passed to any command shown in this
rem README - confirm how the jobs pick it up.
set TESSERACT_LANG=fra

java -cp target\scala-2.10\document-analysis-1.0-SNAPSHOT-hadoop2.6.0.jar org.grozeille.CopyToAvro -i %RAW_PATH% -o %AVRO_PATH%

set SPARK_SUBMIT_OPTS=-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=5005



spark-submit --master local[8] --class org.grozeille.DocumentAnalysisAvro target\scala-2.10\document-analysis-1.0-SNAPSHOT-hadoop2.6.0.jar -i %AVRO_PATH% -o %PARSED_PATH% -t %TESSERACT_PATH% > out 2>&1

spark-submit --master local[8] --class org.grozeille.DocumentAnalysisAvroEncryptedPdf target\scala-2.10\document-analysis-1.0-SNAPSHOT-hadoop2.6.0.jar -i %AVRO_PATH% -o %DECRYPTED_PATH% -t %TESSERACT_PATH%

bin\solr start -c -m 1g -z localhost:2181 -f
bin\solr create -p 8983 -c ineodoc -d basic_configs -s 1 -rf 1
bin\solr delete -p 8983 -c ineodoc

server\scripts\cloud-scripts\zkcli.bat -zkhost localhost:2181 -cmd putfile server/solr/configsets/ineodoc_configs/conf/schema.xml /configs/ineodoc/schema.xml
server\scripts\cloud-scripts\zkcli.bat -zkhost localhost:2181 -cmd upconfig  -confname ineodoc  -confdir server/solr/configsets/ineodoc_configs/conf
curl -X GET "http://localhost:8983/solr/admin/cores?wt=json&action=RELOAD&core=ineodoc_shard1_replica1"

rem Delete every document in the ineodoc collection and commit the change.
rem "-GET" is not a curl option (curl reads it as bundled short flags), so the
rem delete-by-query is sent as an XML POST body instead.
curl "http://localhost:8983/solr/ineodoc/update?commit=true" -H "Content-Type: text/xml" --data-binary "<delete><query>*:*</query></delete>"


rem Set the collection's soft auto-commit interval to 2000 ms through the Config API.
rem cmd.exe does not treat single quotes as quoting characters, so the JSON payload
rem is wrapped in double quotes with the inner quotes escaped (this form also works
rem in bash). The content type is stated explicitly so Solr parses the body as JSON.
curl -X POST -H "Content-type:application/json" http://localhost:8983/solr/ineodoc/config -d "{\"set-property\":{\"updateHandler.autoSoftCommit.maxTime\":\"2000\"}}"



spark-submit --master local[8] --class org.grozeille.DetectLang target\scala-2.10\document-analysis-1.0-SNAPSHOT-hadoop2.6.0.jar -i %PARSED_PATH% -o %LANG_PATH%

spark-submit --master local[8] --class org.grozeille.Clustering target\scala-2.10\document-analysis-1.0-SNAPSHOT-hadoop2.6.0.jar -i %PARSED_PATH% -o %CLUSTERING_PATH% -m %CLUSTERING_MODEL_PATH%

spark-submit --master local[8] --class org.grozeille.IndexToSolr target\scala-2.10\document-analysis-1.0-SNAPSHOT-hadoop2.6.0.jar -i %PARSED_PATH% -z localhost:2181 -c ineodoc -l %LANG_PATH% -k %CLUSTERING_PATH%

-- External Hive table over the Avro files holding the parsed documents.
-- Columns: source path, extracted body text, file name and language code,
-- all nullable strings per the embedded Avro schema.
-- IF EXISTS keeps the script re-runnable when the table has not been created yet.
-- Dropping an EXTERNAL table removes only the metastore entry, not the files.
-- NOTE(review): LOCATION points at parsed_simple2, whereas the last PARSED_PATH
-- assignment in the setup commands is parsed_simple3 - confirm which output
-- directory this table should read.
DROP TABLE IF EXISTS documents;
CREATE EXTERNAL TABLE documents (
  path string,
  body string,
  fileName string,
  lang string)
STORED AS AVRO
LOCATION 'C:\\Users\\Mathias\\vagrant\\hadoop\\document-analysis\\output\\parsed_simple2'
TBLPROPERTIES (
    'avro.schema.literal'='{
                             "type" : "record",
                             "name" : "topLevelRecord",
                             "fields" : [ {
                               "name" : "path",
                               "type" : [ "string", "null" ]
                             }, {
                               "name" : "body",
                               "type" : [ "string", "null" ]
                             }, {
                               "name" : "fileName",
                               "type" : [ "string", "null" ]
                             }, {
                               "name" : "lang",
                               "type" : [ "string", "null" ]
                             } ]
                           }')
;

-- External Hive table over the Avro output in the "lang" directory:
-- one row per document path with its language code (both nullable strings).
-- IF EXISTS keeps the script re-runnable when the table has not been created yet.
-- Dropping an EXTERNAL table removes only the metastore entry, not the files.
DROP TABLE IF EXISTS lang;
CREATE EXTERNAL TABLE lang (
  path string,
  lang string)
STORED AS AVRO
LOCATION 'C:\\Users\\Mathias\\vagrant\\hadoop\\document-analysis\\output\\lang'
TBLPROPERTIES (
    'avro.schema.literal'='{
                              "type" : "record",
                              "name" : "topLevelRecord",
                              "fields" : [ {
                                "name" : "path",
                                "type" : [ "string", "null" ]
                              }, {
                                "name" : "lang",
                                "type" : [ "string", "null" ]
                              } ]
                            }')
;

-- External Hive table over the Avro output in the "wordcount" directory.
-- Columns f and w are nullable strings and c is a nullable long (bigint).
-- NOTE(review): presumably f = file, w = word, c = count - confirm against the
-- job that writes this output.
-- IF EXISTS keeps the script re-runnable when the table has not been created yet.
-- Dropping an EXTERNAL table removes only the metastore entry, not the files.
DROP TABLE IF EXISTS wordcount;
CREATE EXTERNAL TABLE wordcount (
  f string,
  w string,
  c bigint)
STORED AS AVRO
LOCATION 'C:\\Users\\Mathias\\vagrant\\hadoop\\document-analysis\\output\\wordcount'
TBLPROPERTIES (
    'avro.schema.literal'='{
                             "type" : "record",
                             "name" : "topLevelRecord",
                             "fields" : [ {
                               "name" : "f",
                               "type" : [ "string", "null" ]
                             }, {
                               "name" : "w",
                               "type" : [ "string", "null" ]
                             }, {
                               "name" : "c",
                               "type" : [ "long", "null" ]
                             } ]
                           }')
;

-- External Hive table over the Avro output in the "clustering" directory:
-- one row per document path with the cluster number assigned to it.
-- The cluster column is declared int so that it agrees with the embedded Avro
-- schema ("int"); it was previously declared bigint, contradicting the schema.
-- NOTE(review): per the Hive AvroSerDe documentation the schema literal is what
-- determines the effective column types - confirm the data files really hold
-- 32-bit ints (otherwise change both declarations to long/bigint together).
-- IF EXISTS keeps the script re-runnable when the table has not been created yet.
-- Dropping an EXTERNAL table removes only the metastore entry, not the files.
DROP TABLE IF EXISTS clusters;
CREATE EXTERNAL TABLE clusters (
  path string,
  cluster int)
STORED AS AVRO
LOCATION 'C:\\Users\\Mathias\\vagrant\\hadoop\\document-analysis\\output\\clustering'
TBLPROPERTIES (
    'avro.schema.literal'='{
                             "type" : "record",
                             "name" : "topLevelRecord",
                             "fields" : [ {
                               "name" : "path",
                               "type" : [ "string", "null" ]
                             }, {
                               "name" : "cluster",
                               "type" : [ "int", "null" ]
                             } ]
                           }')
;

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Something interesting about the web. A new door to the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Something interesting about games, to make everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google ❤️ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.