author | zkwip | 2015-05-20 17:32:35 +0200
committer | zkwip | 2015-05-20 17:32:35 +0200
commit | f6ed9b9c0f10770bd8da27f677a0777f9465b999 (patch)
tree | e0b2a1169dc97b7e5c1bd7dee807045d8991d9e4 /app/src/main/java/org
parent | start smmry, will probably go with something else anyway (diff)
summary shit
only gives a single line, though
Diffstat (limited to 'app/src/main/java/org')
10 files changed, 1787 insertions, 50 deletions
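
Note: this commit replaces the earlier SMMRY HTTP call with a local, extractive summarizer. For orientation, a minimal hypothetical caller, using only what the diff itself introduces (SummaryAPI with getSummary(FeedItem)/getSummaryFromText(String), the public getSumText(String) helper, and a Summary(String) constructor); this is a sketch, not code from the commit:

    import org.rssin.summaries.SummaryAPI;

    // Hypothetical demo class -- not part of this commit.
    class SummaryDemo {
        public static void main(String[] args) {
            SummaryAPI api = new SummaryAPI();
            // getSumText() splits the text into sentences, strips special
            // characters and stopwords, weights words TF-IDF-style and is
            // intended to return the single highest-weighted sentence
            // (hence the commit note that it only gives one line).
            String line = api.getSumText(
                    "Android is a mobile operating system. "
                    + "It was designed for touchscreen devices such as "
                    + "smartphones and tablets. "
                    + "Many Android applications aggregate RSS feeds.");
            System.out.println(line);
        }
    }
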
diff --git a/app/src/main/java/org/rssin/summaries/SentenceItem.java b/app/src/main/java/org/rssin/summaries/SentenceItem.java
new file mode 100644
index 0000000..e8e0e0b
--- /dev/null
+++ b/app/src/main/java/org/rssin/summaries/SentenceItem.java
@@ -0,0 +1,53 @@
+/*
+ * To change this license header, choose License Headers in Project Properties.
+ * To change this template file, choose Tools | Templates
+ * and open the template in the editor.
+ */
+package org.rssin.summaries;
+
+public class SentenceItem
+{
+
+    private String rsen = "";
+    private String srsen = "";
+
+    private double wght;
+
+    public SentenceItem(String s)
+    {
+        rsen = new String(s);
+        wght = 0.0;
+    }
+
+
+    public void setRawSentecse(String sen)
+    {
+        rsen = sen;
+    }
+
+    public void setSRSentence(String rsen)
+    {
+        srsen = rsen;
+    }
+
+    public String getRawSentence()
+    {
+        return rsen;
+    }
+    public String getSRSentence()
+    {
+        return srsen;
+    }
+
+    public void addWeight(double wg)
+    {
+        wght=wght+wg;
+    }
+
+    public double getWeight()
+    {
+        return wght;
+    }
+
+
+}
\ No newline at end of file
diff --git a/app/src/main/java/org/rssin/summaries/Special.java b/app/src/main/java/org/rssin/summaries/Special.java
new file mode 100644
index 0000000..36cb0ad
--- /dev/null
+++ b/app/src/main/java/org/rssin/summaries/Special.java
@@ -0,0 +1,60 @@
+package org.rssin.summaries;
+
+import java.io.*;
+import java.util.StringTokenizer;
+
+public class Special
+{
+
+    char spch[];
+
+    public Special()
+    {
+        /*int cnt=0,sz=0;char bt[]=null;
+        try {
+            File fp=new File("specials.txt");
+            FileReader fis=new FileReader(fp);
+            sz=(int)fp.length();
+            bt=new char[sz];
+            fis.read(bt);
+            fis.close();
+        }
+        catch(IOException ex) {}
+        spch=getTokens(new String(bt));*/
+        spch = new TxtStatic().getSpecial();
+    }
+
+
+    public char[] getTokens(String sen)
+    {
+        int sz=0,cnt=0;char words[]=null;
+        StringTokenizer stk=new StringTokenizer(sen);
+        sz=stk.countTokens();
+        words=new char[sz];
+        while ( stk.hasMoreTokens())
+        {
+            words[cnt]=new String(stk.nextToken()).charAt(0);
+            cnt++;
+        }
+        return words;
+    }
+
+    public String remove(String sen)
+    {
+        String dsen=new String(sen);
+
+        for (int j=0;j<spch.length; j++)
+        {
+
+            char csh[]=dsen.toCharArray();
+            dsen="";
+            for(int i=0;i<csh.length;i++ )
+                if(csh[i] != spch[j])
+                    dsen=dsen + csh[i];
+
+        }
+
+        return dsen;
+    }
+
+}
\ No newline at end of file diff --git a/app/src/main/java/org/rssin/summaries/Stemmer.java b/app/src/main/java/org/rssin/summaries/Stemmer.java new file mode 100644 index 0000000..1cad62f --- /dev/null +++ b/app/src/main/java/org/rssin/summaries/Stemmer.java @@ -0,0 +1,852 @@ +package org.rssin.summaries; + +import java.util.*; + +public class Stemmer { + + + private static boolean m_CompMode = false; + private static HashMap m_l11 = null; + private static HashMap m_l10 = null; + private static HashMap m_l9 = null; + private static HashMap m_l8 = null; + private static HashMap m_l7 = null; + private static HashMap m_l6 = null; + private static HashMap m_l5 = null; + private static HashMap m_l4 = null; + private static HashMap m_l3 = null; + private static HashMap m_l2 = null; + private static HashMap m_l1 = null; + + static { + + m_l11 = new HashMap(); + m_l11.put("alistically", "B"); + m_l11.put("arizability", "A"); + m_l11.put("izationally", "B"); + m_l10 = new HashMap(); + m_l10.put("antialness", "A"); + m_l10.put("arisations", "A"); + m_l10.put("arizations", "A"); + m_l10.put("entialness", "A"); + m_l9 = new HashMap(); + m_l9.put("allically", "C"); + m_l9.put("antaneous", "A"); + m_l9.put("antiality", "A"); + m_l9.put("arisation", "A"); + m_l9.put("arization", "A"); + m_l9.put("ationally", "B"); + m_l9.put("ativeness", "A"); + m_l9.put("eableness", "E"); + m_l9.put("entations", "A"); + m_l9.put("entiality", "A"); + m_l9.put("entialize", "A"); + m_l9.put("entiation", "A"); + m_l9.put("ionalness", "A"); + m_l9.put("istically", "A"); + m_l9.put("itousness", "A"); + m_l9.put("izability", "A"); + m_l9.put("izational", "A"); + m_l8 = new HashMap(); + m_l8.put("ableness", "A"); + m_l8.put("arizable", "A"); + m_l8.put("entation", "A"); + m_l8.put("entially", "A"); + m_l8.put("eousness", "A"); + m_l8.put("ibleness", "A"); + m_l8.put("icalness", "A"); + m_l8.put("ionalism", "A"); + m_l8.put("ionality", "A"); + m_l8.put("ionalize", "A"); + m_l8.put("iousness", "A"); + m_l8.put("izations", "A"); + m_l8.put("lessness", "A"); + m_l7 = new HashMap(); + m_l7.put("ability", "A"); + m_l7.put("aically", "A"); + m_l7.put("alistic", "B"); + m_l7.put("alities", "A"); + m_l7.put("ariness", "E"); + m_l7.put("aristic", "A"); + m_l7.put("arizing", "A"); + m_l7.put("ateness", "A"); + m_l7.put("atingly", "A"); + m_l7.put("ational", "B"); + m_l7.put("atively", "A"); + m_l7.put("ativism", "A"); + m_l7.put("elihood", "E"); + m_l7.put("encible", "A"); + m_l7.put("entally", "A"); + m_l7.put("entials", "A"); + m_l7.put("entiate", "A"); + m_l7.put("entness", "A"); + m_l7.put("fulness", "A"); + m_l7.put("ibility", "A"); + m_l7.put("icalism", "A"); + m_l7.put("icalist", "A"); + m_l7.put("icality", "A"); + m_l7.put("icalize", "A"); + m_l7.put("ication", "G"); + m_l7.put("icianry", "A"); + m_l7.put("ination", "A"); + m_l7.put("ingness", "A"); + m_l7.put("ionally", "A"); + m_l7.put("isation", "A"); + m_l7.put("ishness", "A"); + m_l7.put("istical", "A"); + m_l7.put("iteness", "A"); + m_l7.put("iveness", "A"); + m_l7.put("ivistic", "A"); + m_l7.put("ivities", "A"); + m_l7.put("ization", "F"); + m_l7.put("izement", "A"); + m_l7.put("oidally", "A"); + m_l7.put("ousness", "A"); + m_l6 = new HashMap(); + m_l6.put("aceous", "A"); + m_l6.put("acious", "B"); + m_l6.put("action", "G"); + m_l6.put("alness", "A"); + m_l6.put("ancial", "A"); + m_l6.put("ancies", "A"); + m_l6.put("ancing", "B"); + m_l6.put("ariser", "A"); + m_l6.put("arized", "A"); + m_l6.put("arizer", "A"); + m_l6.put("atable", "A"); + m_l6.put("ations", "B"); + 
m_l6.put("atives", "A"); + m_l6.put("eature", "Z"); + m_l6.put("efully", "A"); + m_l6.put("encies", "A"); + m_l6.put("encing", "A"); + m_l6.put("ential", "A"); + m_l6.put("enting", "C"); + m_l6.put("entist", "A"); + m_l6.put("eously", "A"); + m_l6.put("ialist", "A"); + m_l6.put("iality", "A"); + m_l6.put("ialize", "A"); + m_l6.put("ically", "A"); + m_l6.put("icance", "A"); + m_l6.put("icians", "A"); + m_l6.put("icists", "A"); + m_l6.put("ifully", "A"); + m_l6.put("ionals", "A"); + m_l6.put("ionate", "D"); + m_l6.put("ioning", "A"); + m_l6.put("ionist", "A"); + m_l6.put("iously", "A"); + m_l6.put("istics", "A"); + m_l6.put("izable", "E"); + m_l6.put("lessly", "A"); + m_l6.put("nesses", "A"); + m_l6.put("oidism", "A"); + m_l5 = new HashMap(); + m_l5.put("acies", "A"); + m_l5.put("acity", "A"); + m_l5.put("aging", "B"); + m_l5.put("aical", "A"); + if (!m_CompMode) { + m_l5.put("alist", "A"); + } + m_l5.put("alism", "B"); + m_l5.put("ality", "A"); + m_l5.put("alize", "A"); + m_l5.put("allic", "b"); + m_l5.put("anced", "B"); + m_l5.put("ances", "B"); + m_l5.put("antic", "C"); + m_l5.put("arial", "A"); + m_l5.put("aries", "A"); + m_l5.put("arily", "A"); + m_l5.put("arity", "B"); + m_l5.put("arize", "A"); + m_l5.put("aroid", "A"); + m_l5.put("ately", "A"); + m_l5.put("ating", "I"); + m_l5.put("ation", "B"); + m_l5.put("ative", "A"); + m_l5.put("ators", "A"); + m_l5.put("atory", "A"); + m_l5.put("ature", "E"); + m_l5.put("early", "Y"); + m_l5.put("ehood", "A"); + m_l5.put("eless", "A"); + if (!m_CompMode) { + m_l5.put("elily", "A"); + } else { + m_l5.put("elity", "A"); + } + m_l5.put("ement", "A"); + m_l5.put("enced", "A"); + m_l5.put("ences", "A"); + m_l5.put("eness", "E"); + m_l5.put("ening", "E"); + m_l5.put("ental", "A"); + m_l5.put("ented", "C"); + m_l5.put("ently", "A"); + m_l5.put("fully", "A"); + m_l5.put("ially", "A"); + m_l5.put("icant", "A"); + m_l5.put("ician", "A"); + m_l5.put("icide", "A"); + m_l5.put("icism", "A"); + m_l5.put("icist", "A"); + m_l5.put("icity", "A"); + m_l5.put("idine", "I"); + m_l5.put("iedly", "A"); + m_l5.put("ihood", "A"); + m_l5.put("inate", "A"); + m_l5.put("iness", "A"); + m_l5.put("ingly", "B"); + m_l5.put("inism", "J"); + m_l5.put("inity", "c"); + m_l5.put("ional", "A"); + m_l5.put("ioned", "A"); + m_l5.put("ished", "A"); + m_l5.put("istic", "A"); + m_l5.put("ities", "A"); + m_l5.put("itous", "A"); + m_l5.put("ively", "A"); + m_l5.put("ivity", "A"); + m_l5.put("izers", "F"); + m_l5.put("izing", "F"); + m_l5.put("oidal", "A"); + m_l5.put("oides", "A"); + m_l5.put("otide", "A"); + m_l5.put("ously", "A"); + m_l4 = new HashMap(); + m_l4.put("able", "A"); + m_l4.put("ably", "A"); + m_l4.put("ages", "B"); + m_l4.put("ally", "B"); + m_l4.put("ance", "B"); + m_l4.put("ancy", "B"); + m_l4.put("ants", "B"); + m_l4.put("aric", "A"); + m_l4.put("arly", "K"); + m_l4.put("ated", "I"); + m_l4.put("ates", "A"); + m_l4.put("atic", "B"); + m_l4.put("ator", "A"); + m_l4.put("ealy", "Y"); + m_l4.put("edly", "E"); + m_l4.put("eful", "A"); + m_l4.put("eity", "A"); + m_l4.put("ence", "A"); + m_l4.put("ency", "A"); + m_l4.put("ened", "E"); + m_l4.put("enly", "E"); + m_l4.put("eous", "A"); + m_l4.put("hood", "A"); + m_l4.put("ials", "A"); + m_l4.put("ians", "A"); + m_l4.put("ible", "A"); + m_l4.put("ibly", "A"); + m_l4.put("ical", "A"); + m_l4.put("ides", "L"); + m_l4.put("iers", "A"); + m_l4.put("iful", "A"); + m_l4.put("ines", "M"); + m_l4.put("ings", "N"); + m_l4.put("ions", "B"); + m_l4.put("ious", "A"); + m_l4.put("isms", "B"); + m_l4.put("ists", "A"); + m_l4.put("itic", "H"); 
+ m_l4.put("ized", "F"); + m_l4.put("izer", "F"); + m_l4.put("less", "A"); + m_l4.put("lily", "A"); + m_l4.put("ness", "A"); + m_l4.put("ogen", "A"); + m_l4.put("ward", "A"); + m_l4.put("wise", "A"); + m_l4.put("ying", "B"); + m_l4.put("yish", "A"); + m_l3 = new HashMap(); + m_l3.put("acy", "A"); + m_l3.put("age", "B"); + m_l3.put("aic", "A"); + m_l3.put("als", "b"); + m_l3.put("ant", "B"); + m_l3.put("ars", "O"); + m_l3.put("ary", "F"); + m_l3.put("ata", "A"); + m_l3.put("ate", "A"); + m_l3.put("eal", "Y"); + m_l3.put("ear", "Y"); + m_l3.put("ely", "E"); + m_l3.put("ene", "E"); + m_l3.put("ent", "C"); + m_l3.put("ery", "E"); + m_l3.put("ese", "A"); + m_l3.put("ful", "A"); + m_l3.put("ial", "A"); + m_l3.put("ian", "A"); + m_l3.put("ics", "A"); + m_l3.put("ide", "L"); + m_l3.put("ied", "A"); + m_l3.put("ier", "A"); + m_l3.put("ies", "P"); + m_l3.put("ily", "A"); + m_l3.put("ine", "M"); + m_l3.put("ing", "N"); + m_l3.put("ion", "Q"); + m_l3.put("ish", "C"); + m_l3.put("ism", "B"); + m_l3.put("ist", "A"); + m_l3.put("ite", "a"); + m_l3.put("ity", "A"); + m_l3.put("ium", "A"); + m_l3.put("ive", "A"); + m_l3.put("ize", "F"); + m_l3.put("oid", "A"); + m_l3.put("one", "R"); + m_l3.put("ous", "A"); + m_l2 = new HashMap(); + m_l2.put("ae", "A"); + m_l2.put("al", "b"); + m_l2.put("ar", "X"); + m_l2.put("as", "B"); + m_l2.put("ed", "E"); + m_l2.put("en", "F"); + m_l2.put("es", "E"); + m_l2.put("ia", "A"); + m_l2.put("ic", "A"); + m_l2.put("is", "A"); + m_l2.put("ly", "B"); + m_l2.put("on", "S"); + m_l2.put("or", "T"); + m_l2.put("um", "U"); + m_l2.put("us", "V"); + m_l2.put("yl", "R"); + m_l2.put("s\'", "A"); + m_l2.put("\'s", "A"); + m_l1 = new HashMap(); + m_l1.put("a", "A"); + m_l1.put("e", "A"); + m_l1.put("i", "A"); + m_l1.put("o", "A"); + m_l1.put("s", "W"); + m_l1.put("y", "B"); + } + + private String removeEnding(String word) { + + int length = word.length(); + int el = 11; + + while (el > 0) { + if (length - el > 1) { + String ending = word.substring(length - el); + String conditionCode = null; + switch (el) { + case 11: conditionCode = (String)m_l11.get(ending); + break; + case 10: conditionCode = (String)m_l10.get(ending); + break; + case 9: conditionCode = (String)m_l9.get(ending); + break; + case 8: conditionCode = (String)m_l8.get(ending); + break; + case 7: conditionCode = (String)m_l7.get(ending); + break; + case 6: conditionCode = (String)m_l6.get(ending); + break; + case 5: conditionCode = (String)m_l5.get(ending); + break; + case 4: conditionCode = (String)m_l4.get(ending); + break; + case 3: conditionCode = (String)m_l3.get(ending); + break; + case 2: conditionCode = (String)m_l2.get(ending); + break; + case 1: conditionCode = (String)m_l1.get(ending); + break; + default: + } + if (conditionCode != null) { + switch (conditionCode.charAt(0)) { + case 'A': + return word.substring(0, length - el); + case 'B': + if (length - el > 2) { + return word.substring(0, length - el); + } + break; + case 'C': + if (length - el > 3) { + return word.substring(0, length - el); + } + break; + case 'D': + if (length - el > 4) { + return word.substring(0, length - el); + } + break; + case 'E': + if (word.charAt(length - el - 1) != 'e') { + return word.substring(0, length - el); + } + break; + case 'F': + if ((length - el > 2) && + (word.charAt(length - el - 1) != 'e')) { + return word.substring(0, length - el); + } + break; + case 'G': + if ((length - el > 2) && + (word.charAt(length - el - 1) == 'f')) { + return word.substring(0, length - el); + } + break; + case 'H': + if ((word.charAt(length - el 
- 1) == 't') || + ((word.charAt(length - el - 1) == 'l') && + (word.charAt(length - el - 2) == 'l'))) { + return word.substring(0, length - el); + } + break; + case 'I': + if ((word.charAt(length - el - 1) != 'o') && + (word.charAt(length - el - 1) != 'e')) { + return word.substring(0, length - el); + } + break; + case 'J': + if ((word.charAt(length - el - 1) != 'a') && + (word.charAt(length - el - 1) != 'e')) { + return word.substring(0, length - el); + } + break; + case 'K': + if ((length - el > 2) && + ((word.charAt(length - el - 1) == 'l') || + (word.charAt(length - el - 1) == 'i') || + ((word.charAt(length - el - 1) == 'e') && + (word.charAt(length - el - 3) == 'u')))) { + return word.substring(0, length - el); + } + break; + case 'L': + if ((word.charAt(length - el - 1) != 'u') && + (word.charAt(length - el - 1) != 'x') && + ((word.charAt(length - el - 1) != 's') || + (word.charAt(length - el - 2) == 'o'))) { + return word.substring(0, length - el); + } + break; + case 'M': + if ((word.charAt(length - el - 1) != 'a') && + (word.charAt(length - el - 1) != 'c') && + (word.charAt(length - el - 1) != 'e') && + (word.charAt(length - el - 1) != 'm')) { + return word.substring(0, length - el); + } + break; + case 'N': + if ((length - el > 3) || + ((length - el == 3) && + ((word.charAt(length - el - 3) != 's')))) { + return word.substring(0, length - el); + } + break; + case 'O': + if ((word.charAt(length - el - 1) == 'l') || + (word.charAt(length - el - 1) == 'i')) { + return word.substring(0, length - el); + } + break; + case 'P': + if (word.charAt(length - el - 1) != 'c') { + return word.substring(0, length - el); + } + break; + case 'Q': + if ((length - el > 2) && + (word.charAt(length - el - 1) != 'l') && + (word.charAt(length - el - 1) != 'n')) { + return word.substring(0, length - el); + } + break; + case 'R': + if ((word.charAt(length - el - 1) == 'n') || + (word.charAt(length - el - 1) == 'r')) { + return word.substring(0, length - el); + } + break; + case 'S': + if (((word.charAt(length - el - 1) == 'r') && + (word.charAt(length - el - 2) == 'd')) || + ((word.charAt(length - el - 1) == 't') && + (word.charAt(length - el - 2) != 't'))) { + return word.substring(0, length - el); + } + break; + case 'T': + if ((word.charAt(length - el - 1) == 's') || + ((word.charAt(length - el - 1) == 't') && + (word.charAt(length - el - 2) != 'o'))) { + return word.substring(0, length - el); + } + break; + case 'U': + if ((word.charAt(length - el - 1) == 'l') || + (word.charAt(length - el - 1) == 'm') || + (word.charAt(length - el - 1) == 'n') || + (word.charAt(length - el - 1) == 'r')) { + return word.substring(0, length - el); + } + break; + case 'V': + if (word.charAt(length - el - 1) == 'c') { + return word.substring(0, length - el); + } + break; + case 'W': + if ((word.charAt(length - el - 1) != 's') && + (word.charAt(length - el - 1) != 'u')) { + return word.substring(0, length - el); + } + break; + case 'X': + if ((word.charAt(length - el - 1) == 'l') || + (word.charAt(length - el - 1) == 'i') || + ((length - el > 2) && + (word.charAt(length - el - 1) == 'e') && + (word.charAt(length - el - 3) == 'u'))) { + return word.substring(0, length - el); + } + break; + case 'Y': + if ((word.charAt(length - el - 1) == 'n') && + (word.charAt(length - el - 2) == 'i')) { + return word.substring(0, length - el); + } + break; + case 'Z': + if (word.charAt(length - el - 1) != 'f') { + return word.substring(0, length - el); + } + break; + case 'a': + if ((word.charAt(length - el - 1) == 'd') || + 
(word.charAt(length - el - 1) == 'f') || + (((word.charAt(length - el - 1) == 'h') && + (word.charAt(length - el - 2) == 'p'))) || + (((word.charAt(length - el - 1) == 'h') && + (word.charAt(length - el - 2) == 't'))) || + (word.charAt(length - el - 1) == 'l') || + (((word.charAt(length - el - 1) == 'r') && + (word.charAt(length - el - 2) == 'e'))) || + (((word.charAt(length - el - 1) == 'r') && + (word.charAt(length - el - 2) == 'o'))) || + (((word.charAt(length - el - 1) == 's') && + (word.charAt(length - el - 2) == 'e'))) || + (word.charAt(length - el - 1) == 't')) { + return word.substring(0, length - el); + } + break; + case 'b': + if (m_CompMode) { + if (((length - el == 3 ) && + (!((word.charAt(length - el - 1) == 't') && + (word.charAt(length - el - 2) == 'e') && + (word.charAt(length - el - 3) == 'm')))) || + ((length - el > 3) && + (!((word.charAt(length - el - 1) == 't') && + (word.charAt(length - el - 2) == 's') && + (word.charAt(length - el - 3) == 'y') && + (word.charAt(length - el - 4) == 'r'))))) { + return word.substring(0, length - el); + } + } else { + if ((length - el > 2) && + (!((word.charAt(length - el - 1) == 't') && + (word.charAt(length - el - 2) == 'e') && + (word.charAt(length - el - 3) == 'm'))) && + ((length - el < 4) || + (!((word.charAt(length - el - 1) == 't') && + (word.charAt(length - el - 2) == 's') && + (word.charAt(length - el - 3) == 'y') && + (word.charAt(length - el - 4) == 'r'))))) { + return word.substring(0, length - el); + } + } + break; + case 'c': + if (word.charAt(length - el - 1) == 'l') { + return word.substring(0, length - el); + } + break; + default: + throw new IllegalArgumentException("Fatal error."); + } + } + } + el--; + } + return word; + } + + private String recodeEnding(String word) { + + int lastPos = word.length() - 1; + + // Rule 1 + if (word.endsWith("bb") || + word.endsWith("dd") || + word.endsWith("gg") || + word.endsWith("ll") || + word.endsWith("mm") || + word.endsWith("nn") || + word.endsWith("pp") || + word.endsWith("rr") || + word.endsWith("ss") || + word.endsWith("tt")) { + word = word.substring(0, lastPos); + lastPos--; + } + + // Rule 2 + if (word.endsWith("iev")) { + word = word.substring(0, lastPos - 2).concat("ief"); + } + + // Rule 3 + if (word.endsWith("uct")) { + word = word.substring(0, lastPos - 2).concat("uc"); + lastPos--; + } + + // Rule 4 + if (word.endsWith("umpt")) { + word = word.substring(0, lastPos - 3).concat("um"); + lastPos -= 2; + } + + // Rule 5 + if (word.endsWith("rpt")) { + word = word.substring(0, lastPos - 2).concat("rb"); + lastPos--; + } + + // Rule 6 + if (word.endsWith("urs")) { + word = word.substring(0, lastPos - 2).concat("ur"); + lastPos--; + } + + // Rule 7 + if (word.endsWith("istr")) { + word = word.substring(0, lastPos - 3).concat("ister"); + lastPos++; + } + + // Rule 7a + if (word.endsWith("metr")) { + word = word.substring(0, lastPos - 3).concat("meter"); + lastPos++; + } + + // Rule 8 + if (word.endsWith("olv")) { + word = word.substring(0, lastPos - 2).concat("olut"); + lastPos++; + } + + // Rule 9 + if (word.endsWith("ul")) { + if ((lastPos - 2 < 0) || + ((word.charAt(lastPos - 2) != 'a') && + (word.charAt(lastPos - 2) != 'i') && + (word.charAt(lastPos - 2) != 'o'))) { + word = word.substring(0, lastPos - 1).concat("l"); + lastPos--; + } + } + + // Rule 10 + if (word.endsWith("bex")) { + word = word.substring(0, lastPos - 2).concat("bic"); + } + + // Rule 11 + if (word.endsWith("dex")) { + word = word.substring(0, lastPos - 2).concat("dic"); + } + + // Rule 12 + if 
(word.endsWith("pex")) { + word = word.substring(0, lastPos - 2).concat("pic"); + } + + // Rule 13 + if (word.endsWith("tex")) { + word = word.substring(0, lastPos - 2).concat("tic"); + } + + // Rule 14 + if (word.endsWith("ax")) { + word = word.substring(0, lastPos - 1).concat("ac"); + } + + // Rule 15 + if (word.endsWith("ex")) { + word = word.substring(0, lastPos - 1).concat("ec"); + } + + // Rule 16 + if (word.endsWith("ix")) { + word = word.substring(0, lastPos - 1).concat("ic"); + } + + // Rule 17 + if (word.endsWith("lux")) { + word = word.substring(0, lastPos - 2).concat("luc"); + } + + // Rule 18 + if (word.endsWith("uad")) { + word = word.substring(0, lastPos - 2).concat("uas"); + } + + // Rule 19 + if (word.endsWith("vad")) { + word = word.substring(0, lastPos - 2).concat("vas"); + } + + // Rule 20 + if (word.endsWith("cid")) { + word = word.substring(0, lastPos - 2).concat("cis"); + } + + // Rule 21 + if (word.endsWith("lid")) { + word = word.substring(0, lastPos - 2).concat("lis"); + } + + // Rule 22 + if (word.endsWith("erid")) { + word = word.substring(0, lastPos - 3).concat("eris"); + } + + // Rule 23 + if (word.endsWith("pand")) { + word = word.substring(0, lastPos - 3).concat("pans"); + } + + // Rule 24 + if (word.endsWith("end")) { + if ((lastPos - 3 < 0) || + (word.charAt(lastPos - 3) != 's')) { + word = word.substring(0, lastPos - 2).concat("ens"); + } + } + + // Rule 25 + if (word.endsWith("ond")) { + word = word.substring(0, lastPos - 2).concat("ons"); + } + + // Rule 26 + if (word.endsWith("lud")) { + word = word.substring(0, lastPos - 2).concat("lus"); + } + + // Rule 27 + if (word.endsWith("rud")) { + word = word.substring(0, lastPos - 2).concat("rus"); + } + + // Rule 28 + if (word.endsWith("her")) { + if ((lastPos - 3 < 0) || + ((word.charAt(lastPos - 3) != 'p') && + (word.charAt(lastPos - 3) != 't'))) { + word = word.substring(0, lastPos - 2).concat("hes"); + } + } + + // Rule 29 + if (word.endsWith("mit")) { + word = word.substring(0, lastPos - 2).concat("mis"); + } + + // Rule 30 + if (word.endsWith("end")) { + if ((lastPos - 3 < 0) || + (word.charAt(lastPos - 3) != 'm')) { + word = word.substring(0, lastPos - 2).concat("ens"); + } + } + + // Rule 31 + if (word.endsWith("ert")) { + word = word.substring(0, lastPos - 2).concat("ers"); + } + + // Rule 32 + if (word.endsWith("et")) { + if ((lastPos - 2 < 0) || + (word.charAt(lastPos - 2) != 'n')) { + word = word.substring(0, lastPos - 1).concat("es"); + } + } + + // Rule 33 + if (word.endsWith("yt")) { + word = word.substring(0, lastPos - 1).concat("ys"); + } + + // Rule 34 + if (word.endsWith("yz")) { + word = word.substring(0, lastPos - 1).concat("ys"); + } + + return word; + } + + public String stem(String word) { + + if (word.length() > 2) { + return recodeEnding(removeEnding(word.toLowerCase())); + } else { + return word.toLowerCase(); + } + } + + public String stemString(String str) { + + StringBuffer result = new StringBuffer(); + int start = -1; + for (int j = 0; j < str.length(); j++) { + char c = str.charAt(j); + if (Character.isLetterOrDigit(c)) { + if (start == -1) { + start = j; + } + } else if (c == '\'') { + if (start == -1) { + result.append(c); + } + } else { + if (start != -1) { + result.append(stem(str.substring(start, j))); + start = -1; + } + result.append(c); + } + } + if (start != -1) { + result.append(stem(str.substring(start, str.length()))); + } + return result.toString(); + } + +} + + diff --git a/app/src/main/java/org/rssin/summaries/Stopword.java 
b/app/src/main/java/org/rssin/summaries/Stopword.java
new file mode 100644
index 0000000..af3bee6
--- /dev/null
+++ b/app/src/main/java/org/rssin/summaries/Stopword.java
@@ -0,0 +1,74 @@
+package org.rssin.summaries;
+
+import java.io.*;
+import java.util.StringTokenizer;
+
+public class Stopword
+{
+
+    String stwd[];
+
+    public Stopword ()
+    {
+        /*int cnt=0,sz=0;char bt[]=null;
+        try
+        {
+            File fp = new File("stopwords.txt");
+            FileReader fis = new FileReader(fp);
+            sz = (int)fp.length();
+            bt = new char [sz];
+            fis.read(bt);
+            fis.close();
+        }
+        catch(IOException ex) {}
+
+        stwd=getTokens(new String(bt)); */
+
+        stwd = new TxtStatic().getStopWords();
+    }
+
+    public void display ()
+    {
+        for (int i=0;i<stwd.length;i++)
+            System.out.println(stwd[i]);
+    }
+
+    public boolean isStopword( String word)
+    {
+        boolean flag=false;
+        for (int i =0;i<stwd.length;i++) {
+            if(stwd[i].equalsIgnoreCase(word) ) {
+                flag=true;
+                break;
+            }
+        }
+        return flag;
+    }
+
+    public String[] getTokens(String sen)
+    {
+        int sz=0,cnt=0;String words[]=null;
+        StringTokenizer stk=new StringTokenizer(sen) ;
+        sz=stk.countTokens();
+        words=new String[sz];
+        while ( stk.hasMoreTokens())
+        {
+            words[cnt]=new String(stk.nextToken());
+            cnt++;
+        }
+        return words;
+    }
+
+    public String remove(String sen)
+    {
+        String dsen="";
+        String words[]=getTokens(sen);
+        for (int j=0;j<words.length; j++)
+        {
+            if ( ! isStopword(words[j] ) )
+                dsen = dsen +words[j] +" ";
+        }
+        return dsen;
+    }
+
+}
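
Stopword (together with Special and the hard-coded word lists in TxtStatic further down) is what the new SummaryAPI.getSumText() uses to clean each sentence. A small hypothetical illustration of the class on its own; the expected output follows from the stopword list in TxtStatic:

    import org.rssin.summaries.Stopword;

    // Hypothetical illustration -- not part of this commit.
    class StopwordDemo {
        public static void main(String[] args) {
            Stopword stop = new Stopword();
            System.out.println(stop.isStopword("the"));   // true
            System.out.println(stop.isStopword("fox"));   // false
            // remove() keeps the non-stopword tokens and re-joins them,
            // appending a space after each kept word.
            System.out.println(stop.remove("the quick brown fox"));
            // prints: "quick brown fox "
        }
    }
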
\ No newline at end of file diff --git a/app/src/main/java/org/rssin/summaries/SummaryAPI.java b/app/src/main/java/org/rssin/summaries/SummaryAPI.java index 68bc38d..83c3fa6 100644 --- a/app/src/main/java/org/rssin/summaries/SummaryAPI.java +++ b/app/src/main/java/org/rssin/summaries/SummaryAPI.java @@ -1,53 +1,283 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ package org.rssin.summaries; -import java.net.URL; +import java.util.ArrayList; +import java.util.Enumeration; +import java.util.Hashtable; +import java.util.Set; +import org.rssin.rss.FeedItem; + +/** + * Summary API van AST + * @author Joep + */ public class SummaryAPI implements SummaryAPIInterface { - - private final String APIURL = "http://api.smmry.com/"; - private final String APIKEY = "D5DDCDBD6F"; - private final int LINES = 3; - - public SummaryAPI() - { - - } - - @Override; - public Summary getSummary(FeedItem f) - { - String desc = f.description; - String sum = sendRequest(desc); - - } - - private String sendRequest(String desc) - { - String q = APIURL + "?SM_API_KEY=" + APIKEY + "&SM_LENGTH=" + LINES; - - // Create a new HttpClient and Post Header - HttpClient httpclient = new DefaultHttpClient(); - HttpPost httppost = new HttpPost(q); - - try { - - List<NameValuePair> nameValuePairs = new ArrayList<NameValuePair>(2); - nameValuePairs.add(new BasicNameValuePair("sm_api_input", desc)); - httppost.setEntity(new UrlEncodedFormEntity(nameValuePairs)); - - // Execute HTTP Post Request - HttpResponse response = httpclient.execute(httppost); - - HttpEntity con = response.getEntity(); - InputStream in = con.getInputStream(); - String encoding = con.getContentEncoding(); - encoding = encoding == null ? "UTF-8" : encoding; - String body = IOUtils.toString(in, encoding); - - } catch (ClientProtocolException e) { - // TODO Auto-generated catch block - } catch (IOException e) { - // TODO Auto-generated catch block - } - } -}
\ No newline at end of file + + @Override + public Summary getSummary(FeedItem f) { + //todo + String desc = f.description; + String t = getSumText(desc); + Summary s = new Summary(t); + + return s; + } + + @Override + public Summary getSummaryFromText(String t) { + Summary s = new Summary(t); + + return s; + } + + public String getSumText(String desc) + { + Hashtable hs = new Hashtable(); + ArrayList zinnen = getSentences(desc); + + //remove stopwords + + Stopword stop = new Stopword(); + Special specl = new Special(); + for( int i=0; i<zinnen.size(); i++) + { + SentenceItem sl = (SentenceItem)zinnen.get(i); + sl.setSRSentence(specl.remove(sl.getRawSentence())); + sl.setSRSentence(stop.remove(sl.getSRSentence())); + } + + //unique words + + for(int i=0;i<zinnen.size(); i++) + { + SentenceItem sl = (SentenceItem)zinnen.get(i); + String sen = sl.getSRSentence(); + + int wordcount=0; + String[] words = sen.split(" "); + + for (String tok : words) { + + tok = tok.trim(); + + wordcount++; + + if(!hs.containsKey(tok) && tok.length() >=3) + addword(tok,i,wordcount,hs); + else if(hs.containsKey(tok) ) + upword(tok,i,wordcount,hs); + } + } + + //stemming + + stemming(hs); + + //significant + + Enumeration key=hs.keys(); + while (key.hasMoreElements() ) + delword(key.nextElement(), hs); + + //getWeight + + key=hs.keys(); + while (key.hasMoreElements()) + setWeight(key.nextElement(),hs, zinnen); + + //ranking + return :P + return ranking(zinnen, hs); + } + + private void setWeight(Object tok, Hashtable hs, ArrayList zinnen) + { + double wg=0.0; + WordItem wl=(WordItem)hs.get(tok); + double scnt = (double)zinnen.size(); + double tf=wl.getcount(); + double df=wl.sentensecount(); + wg = tf*Math.log10(scnt/df); + wl.addWeight(wg); + } + + public String ranking(ArrayList<SentenceItem> zinnen, Hashtable hs) + { + SentenceItem sl=null; + double max=0.0; + int mi=0; + + for(int i = 0; i < zinnen.size(); i++) + { + sl = zinnen.get(i); + String sen=sl.getSRSentence(); + Enumeration key = hs.keys(); + while(key.hasMoreElements()) + { + String str=(String)key.nextElement(); + if(sen.indexOf(str) != -1 ) + { + WordItem wl=(WordItem)hs.get(str); + sl.addWeight(wl.getWeight()); + } + } + } + + for(int i = 0; i < zinnen.size(); i++) + { + sl = (SentenceItem)zinnen.get(i); + if( sl.getWeight() > max ) + { + max = sl.getWeight(); + mi=i; + } + } + + String str1 = sl.getRawSentence(); + sl = zinnen.get(mi); + + return str1; + } + + private void delword(Object tok, Hashtable hs) + { + WordItem wl=(WordItem)hs.remove(tok); + if( wl.getcount() > 3 ) + hs.put(tok,wl); + } + + + private void stemword(String w1,String w2, Hashtable hs) + { + if( !hs.containsKey(w2) || !hs.containsKey(w1) ) + { + // System.out.print("return:"); + return; + } + WordItem wl1=(WordItem)hs.remove(w1); + WordItem wl2=(WordItem)hs.remove(w2); + + ArrayList wp=wl2.getwordpos(); + ArrayList sp=wl2.getsentensepos(); + + + for(int i=0;i<wp.size();i++) + { + String wp2=(String)wp.get(i); + String sp2=(String)sp.get(i); + wl1.incrcount(Integer.parseInt(wp2),Integer.parseInt(sp2)); + } + + hs.put(w1,wl1); + } + + + private double difpos(String str1,String str2) + { + int sz = Math.min(str1.length(), str2.length()); + int mz = Math.max(str1.length(), str2.length()); + + double dp=mz; + double sm=0; + + for (int i = 0; i < mz; i++ ) + { + if( str1.charAt(i) != str2.charAt(i) ) + { + dp=i+1; + break; + } + else + sm++; + } + return(sm*(dp/sz)); + } + + + private void stemming(Hashtable hs) + { + int sz=hs.size(); + + double wdis[][] = new double[sz][sz]; + 
+ Set s1=hs.keySet(); + + Object obj[]=s1.toArray(); + + for(int i = 0; i < sz; i++) + { + String str1 = (String) obj[i]; + for(int j=0;j<sz;j++) + { + String str2=(String) obj[j]; + if(i!=j) + wdis[i][j]=difpos(str1,str2); + } + } + + for(int i=0;i<sz;i++) + { + String str1=(String)obj[i]; + for(int j=0;j<sz;j++) + { + String str2 = (String)obj[j]; + if(i != j && wdis[i][j] >= 3.0 ) + { + stemword(str1,str2,hs); + } + } + } + } + + private void addword(String tok,int sp,int wp, Hashtable hs) + { + WordItem wl=new WordItem(tok); + wl.incrcount(sp+1,wp); + hs.put(tok,wl); + } + private void upword(String tok,int sp,int wp, Hashtable hs) + { + WordItem wl=(WordItem)hs.remove(tok); + wl.incrcount(sp+1,wp); + hs.put(tok,wl); + } + + private ArrayList<SentenceItem> getSentences(String doc) + { + ArrayList<SentenceItem> als = new ArrayList(); + + int fs1=0; + int fs2=0; + int nx=0; + + while ( nx < (doc.length()-1) ) + { + nx=doc.indexOf(".",fs2); + if(nx==-1) + break; + else if( nx==doc.lastIndexOf(".") ) + { + String str=(doc.substring(fs1,nx+1)).toLowerCase(); + als.add(new SentenceItem(str)); + fs2=nx+1; + fs1=fs2; + break; + } + else if( doc.charAt(nx+1) ==' ' || doc.charAt(nx+1) =='\r' || doc.charAt(nx+1) =='\n') + { + String str=(doc.substring(fs1,nx+1).toLowerCase()).trim(); + als.add(new SentenceItem(str)); + fs2=nx+1; + fs1=fs2; + } + else + fs2=nx+1; + } + + return als; + } +} diff --git a/app/src/main/java/org/rssin/summaries/SummaryAPIInterface.java b/app/src/main/java/org/rssin/summaries/SummaryAPIInterface.java index 1caf922..17636de 100644 --- a/app/src/main/java/org/rssin/summaries/SummaryAPIInterface.java +++ b/app/src/main/java/org/rssin/summaries/SummaryAPIInterface.java @@ -1,9 +1,10 @@ package org.rssin.summaries; -import rss.FeedItem; +import org.rssin.rss.FeedItem; public interface SummaryAPIInterface { public Summary getSummary(FeedItem f); + public Summary getSummaryFromText(String t); }
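
The scoring in the new SummaryAPI.setWeight() above is a TF-IDF-style weight: for each word that survives the filters (delword() discards words occurring three times or fewer), weight = tf * log10(S / df), where tf is the word's total count, df is the number of distinct sentences it occurs in, and S is the total number of sentences; ranking() then scores each sentence by summing the weights of the significant words it contains. A standalone sketch of just that formula (hypothetical helper, not from this commit):

    // Standalone illustration of the weighting used in SummaryAPI.setWeight():
    //   weight = tf * log10(totalSentences / sentencesContainingWord)
    class WeightDemo {
        static double weight(double tf, double df, double totalSentences) {
            return tf * Math.log10(totalSentences / df);
        }

        public static void main(String[] args) {
            // A word seen 4 times, spread over 2 of 10 sentences:
            System.out.println(weight(4, 2, 10));   // 4 * log10(5) ≈ 2.796
        }
    }
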
\ No newline at end of file diff --git a/app/src/main/java/org/rssin/summaries/TxtStatic.java b/app/src/main/java/org/rssin/summaries/TxtStatic.java new file mode 100644 index 0000000..d686beb --- /dev/null +++ b/app/src/main/java/org/rssin/summaries/TxtStatic.java @@ -0,0 +1,26 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. + */ +package org.rssin.summaries; + +/** + * + * @author Joep + */ +public class TxtStatic { + + private final String[] stopwords = {"a","about","above","across","after","afterwards","again","against","all","almost","alone","along","already","also","although","always","am","among","amongst","amoungst","amount","an","and","another","any","anyhow","anyone","anything","anyway","anywhere","are","around","as","at","back","be","became","because","become","becomes","becoming","been","before","beforehand","behind","being","below","beside","besides","between","beyond","bill","both","bottom","but","by","by","call","can","common","cannot","cant","co","computer","con","could","couldnt","cry","de","describe","detail","do","does","done","down","due","during","each","eg","eight","either","eleven","else","elsewhere","empty","enough","etc","even","ever","every","everyone","everything","everywhere","except","few","fifteen","fify","fill","find","fire","first","five","for","former","formerly","forty","found","four","from","front","full","further","get","give","go","had","has","hasnt","have","he","hence","her","here","hereafter","hereby","herein","hereupon","hers","herself","him","himself","his","how","however","hundred","i","ie","if","in","inc","indeed","interest","into","is","it","its","itself","keep","last","latter","latterly","least","less","ltd","made","many","may","me","meanwhile","might","mill","mine","more","moreover","most","mostly","move","much","must","my","myself","name","namely","neither","never","nevertheless","next","nine","no","nobody","none","noone","nor","not","nothing","now","nowhere","of","off","often","on","once","one","only","onto","or","other","others","otherwise","our","ours","ourselves","out","over","own","part","per","perhaps","please","put","rather","re","same","see","seem","seemed","seeming","seems","serious","several","she","should","show","side","since","sincere","six","sixty","so","some","somehow","someone","something","sometime","sometimes","somewhere","still","such","system","take","ten","than","that","the","their","them","themselves","then","thence","there","thereafter","thereby","therefore","therein","thereupon","these","they","thick","thin","third","this","those","though","three","through","throughout","thru","thus","to","together","too","top","toward","towards","twelve","twenty","two","un","under","until","up","upon","us","usually","usual","very","via","was","we","well","were","what","whatever","when","whence","whenever","where","whereafter","whereas","whereby","whereinwhereupon","wherever","whether","which","while","whither","who","whoever","whole","whom","whose","why","will","with","within","without","would","yet","you","your","yours","yourself","yourselves","don't","won't","can't","didn't","it's","is'nt","aren't","wasn't","haven't","hasn't","hadn't","you've","it'hv","you'd","you're","hasn't","we�ll","you�re","we're","we've"}; + + private final char[] special = {'"', ',', ';', '!', '&', '/', '$', ':', '|', '%', ')', '(', '[', ']', '�', '�', '\'', '.', ' ', '\n'}; + + public String[] getStopWords() + { + return 
stopwords;
+    }
+    public char[] getSpecial()
+    {
+        return special;
+    }
+}
diff --git a/app/src/main/java/org/rssin/summaries/WordItem.java b/app/src/main/java/org/rssin/summaries/WordItem.java
new file mode 100644
index 0000000..fb47749
--- /dev/null
+++ b/app/src/main/java/org/rssin/summaries/WordItem.java
@@ -0,0 +1,78 @@
+package org.rssin.summaries;
+
+import java.util.ArrayList;
+
+public class WordItem
+{
+    private String word;
+    private double cnt;
+    private double scnt;
+    private double wght;
+
+    private ArrayList spl;
+    private ArrayList wpl;
+
+    public WordItem (String wd)
+    {
+        word= new String(wd);
+        spl=new ArrayList();
+        wpl=new ArrayList();
+        cnt=0;scnt=0;
+    }
+
+    public void incrcount(int sp,int wp)
+    {
+        cnt++;
+        sentensepos(sp);
+        wordpos(wp);
+    }
+
+    public double getcount()
+    {
+        return cnt;
+    }
+
+    public String getword()
+    {
+        return word;
+    }
+
+    public void sentensepos(int sp)
+    {
+        if(! spl.contains(sp+""))
+            scnt++;
+        spl.add(sp+"");
+    }
+
+    public void wordpos(int wp)
+    {
+        wpl.add(wp+"");
+    }
+
+    public ArrayList getwordpos()
+    {
+        return wpl;
+    }
+
+    public ArrayList getsentensepos()
+    {
+        return spl;
+    }
+
+    public void addWeight(double wg)
+    {
+        wght=wg;
+    }
+
+    public double getWeight()
+    {
+        return wght;
+    }
+
+    public double sentensecount()
+    {
+        return scnt;
+    }
+
+
+}
\ No newline at end of file diff --git a/app/src/main/java/org/rssin/summaries/specials.txt b/app/src/main/java/org/rssin/summaries/specials.txt new file mode 100644 index 0000000..046d05b --- /dev/null +++ b/app/src/main/java/org/rssin/summaries/specials.txt @@ -0,0 +1,18 @@ +, +" +; +! +& +/ +$ +: +| +% +) +( +[ +] +” +“ +' +. diff --git a/app/src/main/java/org/rssin/summaries/stopwords.txt b/app/src/main/java/org/rssin/summaries/stopwords.txt new file mode 100644 index 0000000..e621f2a --- /dev/null +++ b/app/src/main/java/org/rssin/summaries/stopwords.txt @@ -0,0 +1,345 @@ +a +about +above +across +after +afterwards +again +against +all +almost +alone +along +already +also +although +always +am +among +amongst +amoungst +amount +an +and +another +any +anyhow +anyone +anything +anyway +anywhere +are +around +as +at +back +be +became +because +become +becomes +becoming +been +before +beforehand +behind +being +below +beside +besides +between +beyond +bill +both +bottom +but +by +by +call +can +common +cannot +cant +co +computer +con +could +couldnt +cry +de +describe +detail +do +does +done +down +due +during +each +eg +eight +either +eleven +else +elsewhere +empty +enough +etc +even +ever +every +everyone +everything +everywhere +except +few +fifteen +fify +fill +find +fire +first +five +for +former +formerly +forty +found +four +from +front +full +further +get +give +go +had +has +hasnt +have +he +hence +her +here +hereafter +hereby +herein +hereupon +hers +herself +him +himself +his +how +however +hundred +i +ie +if +in +inc +indeed +interest +into +is +it +its +itself +keep +last +latter +latterly +least +less +ltd +made +many +may +me +meanwhile +might +mill +mine +more +moreover +most +mostly +move +much +must +my +myself +name +namely +neither +never +nevertheless +next +nine +no +nobody +none +noone +nor +not +nothing +now +nowhere +of +off +often +on +once +one +only +onto +or +other +others +otherwise +our +ours +ourselves +out +over +own +part +per +perhaps +please +put +rather +re +same +see +seem +seemed +seeming +seems +serious +several +she +should +show +side +since +sincere +six +sixty +so +some +somehow +someone +something +sometime +sometimes +somewhere +still +such +system +take +ten +than +that +the +their +them +themselves +then +thence +there +thereafter +thereby +therefore +therein +thereupon +these +they +thick +thin +third +this +those +though +three +through +throughout +thru +thus +to +together +too +top +toward +towards +twelve +twenty +two +un +under +until +up +upon +us +usually +usual +very +via +was +we +well +were +what +whatever +when +whence +whenever +where +whereafter +whereas +whereby +wherein +whereupon +wherever +whether +which +while +whither +who +whoever +whole +whom +whose +why +will +with +within +without +would +yet +you +your +yours +yourself +yourselves +don't +won't +can't +didn't +it's +is'nt +aren't +wasn't +haven't +hasn't +hadn't +you've +it'hv +you'd +you're +hasn't +we’ll +you’re +we're +we've + |