BAB VI KESIMPULAN DAN SARAN
6.2. Saran
1. Pengembangan pengujian dengan jumlah koleksi yang lebih besar.
2. Membandingkan kinerja kedua metode Hash table dengan Java Hash table.
3. Melakukan pengujian waktu pencarian yang lebih akurat.
95
DAFTAR PUSTAKA
Agusta, L. 2009. Perbandingan Algoritma Stemming Porter Dengan Algoritma Nazief dan Adriani Untuk Stemming Dokumen Teks Bahasa Indonesia. Konferensi Nasional Sistem dan Informatika 2009. Bali.
Aho, Alfred V., Sethi Ravi, Ullman, Jeffrey D. 1986. Compilers : Principles, Techniques, and Tools. Pearson Education. China. page 435-436.
Bobby, A., A. Nazief, dan Mirna Adriani. 1996. Confix Stripping: Approach to Stemming Algorithm for Bahasa Indonesia. Internal publication. Faculty of Computer Science, University of Indonesia, Depok.
Borko, Harold. 1977. Toward a Theory of Indexing, Information Processing and Management Vol. 13. Pergamon Press. page 355-365.
Cormen, Thomas H. 1990. Introduction to Algorithms Second Edition. Massachusetts Institute of Technology Press. page 201.
Indonesia. Undang-Undang Nomor 43 Tahun 2007 tentang Perpustakaan.
Kowalski, Gerald. 2011. Information Retrieval Architecture and Algorithms. Springer Science+Business Media. page 29-32.
Kowalski, G.J., Maybury, M.T. 2002. Information Storage and Retrieval Systems : Theory and Implementation, second edition. Kluwer Academic Publishers. page 83.
Kruse, Robert L., Ryba, Alexander J. 1998. Data Structures and Program Design in C++. Prentice Hall. page 406-407.
Loudon, Kyle. 1999. “Mastering Algorithms With C”. O’Reilly Media, Inc. page 153-158.
Manning, Christopher D., Prabhakar Raghavan & Hinrich Schütze. 2008. “Introduction to Information Retrieval”. Cambridge University Press. page 23-24.
McKenzie, B.J., Harries, R., and Bell, T.C. 1990. “Selecting a hashing algorithm”. Software Practice and Experience 20. 209–224.
Porter, M. F. 1980. An Algorithm for Suffix Stripping. Computer Laboratory. Corn Exchange Street. Cambridge
Rijsbergen, C. J. van. 1979. “Information Retrieval”. Information Retrieval Group. University of Glasgow. page 6-7.
Schmid, H. 2008. Tokenizing and Part-of-Speech Tagging in Corpus Linguistics : An International Handbook. Walter de Gruyter GmbH & Co. Berlin, Germany. page 527-552.
Yates, Ricardo Baeza & Neto, Berthier Ribeiro. 1992. Modern Information Retrieval : The Concepts and Technology Behind Search. ACM Press Book.
Whitten, Bentley, Dittman. 2004. Systems Analysis and Design Methods 6th Edition, Irwin/McGraw-Hill, New York.
97
LAMPIRAN
Lampiran 1 Listing PreProcessing
package skripsi; import com.itextpdf.text.pdf.PdfReader; import com.itextpdf.text.pdf.parser.ImageRenderInfo; import com.itextpdf.text.pdf.parser.PdfTextExtractor; import com.itextpdf.text.pdf.parser.TextExtractionStrategy; import com.itextpdf.text.pdf.parser.TextRenderInfo; import com.sun.org.apache.xalan.internal.xsltc.runtime.Hashtable; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.util.ArrayList; import java.util.Dictionary; import java.util.ArrayList; import java.util.logging.Level; import java.util.logging.Logger; /** * * @author Lie_Lianx */
/**
 * Text pre-processing for the indexer: loads the word lists, extracts tokens
 * from PDF files, removes stop words and stems what is left.
 */
public class PreProcessing {

    /** Entries read from kamus.txt; every entry is stored as key and value. */
    Hashtable kamusHash;
    /** Stop words read from stopword.txt (one per line), stored as key and value. */
    Hashtable kamusStopWordHash;
    /** Stop words read from the comma-separated stopwordeng.txt, stored as key and value. */
    Hashtable kamusStopWordEngHash;
    PdfReader reader;
    Stemmer porterStemmer;

    /**
     * Loads the three word lists from their hard-coded locations and prints how
     * long loading took. A list that cannot be read is logged and left empty or
     * partially filled; construction still succeeds, as it did before.
     */
    public PreProcessing() {
        double timeStart = (double) System.nanoTime();
        kamusHash = new Hashtable();
        kamusStopWordHash = new Hashtable();
        kamusStopWordEngHash = new Hashtable();
        String str;

        // Dictionary: one entry per line.
        BufferedReader in = null;
        try {
            in = new BufferedReader(new FileReader("G:\\Skripsi\\pustaka\\Stopword n stemming\\kamus.txt"));
            while ((str = in.readLine()) != null) {
                if (str.length() == 0) {
                    // substring(0, -1) would throw on a blank line
                    continue;
                }
                // The last character of every line is dropped, as in the original code
                // (presumably a trailing delimiter in kamus.txt -- TODO confirm).
                str = str.substring(0, (str.length() - 1));
                kamusHash.put(str, str);
            }
        } catch (IOException e) {
            Logger.getLogger(PreProcessing.class.getName()).log(Level.SEVERE, null, e);
        } finally {
            closeQuietly(in);
        }

        // Stop words: one word per line.
        in = null;
        try {
            in = new BufferedReader(new FileReader("G:\\Skripsi\\pustaka\\Stopword n stemming\\stopword.txt"));
            while ((str = in.readLine()) != null) {
                kamusStopWordHash.put(str, str);
            }
        } catch (IOException e) {
            Logger.getLogger(PreProcessing.class.getName()).log(Level.SEVERE, null, e);
        } finally {
            closeQuietly(in);
        }

        // English stop words: comma-separated. Every line is split, so a file with
        // more than one line (or a trailing blank line) no longer loses its content;
        // a single-line file behaves exactly as before.
        in = null;
        try {
            in = new BufferedReader(new FileReader("G:\\Skripsi\\pustaka\\Stopword n stemming\\stopwordeng.txt"));
            while ((str = in.readLine()) != null) {
                String[] stopwordsEngArray = str.split(",");
                for (int i = 0; i < stopwordsEngArray.length; i++) {
                    kamusStopWordEngHash.put(stopwordsEngArray[i], stopwordsEngArray[i]);
                }
            }
        } catch (IOException e) {
            Logger.getLogger(PreProcessing.class.getName()).log(Level.SEVERE, null, e);
        } finally {
            closeQuietly(in);
        }

        double timeFinish = (double) System.nanoTime();
        double time = ((double) (timeFinish - timeStart)) / 1000000000;
        System.out.println("Start time read dictionary : " + timeStart);
        System.out.println("Finish time read dictionary : " + timeFinish);
        System.out.println("Total time use : " + time);
    }

    /** Closes {@code in} if it was opened; a failure to close is logged, not thrown. */
    private static void closeQuietly(BufferedReader in) {
        if (in == null) {
            return;
        }
        try {
            in.close();
        } catch (IOException e) {
            Logger.getLogger(PreProcessing.class.getName()).log(Level.SEVERE, null, e);
        }
    }
/**
 * Stems every word of the given array with the Porter stemmer and prints
 * timing information and the number of resulting words.
 *
 * Each word is lower-cased BEFORE it is handed to the stemmer. The original
 * lower-cased only the stemmer's output, so tokens containing capital letters
 * (which bacaDocument keeps) went through the stemmer unconverted.
 * NOTE(review): assumes Stemmer is Porter's reference implementation, whose
 * suffix rules are written for lower-case letters -- confirm.
 *
 * @param hasilStopwordRemoval words to stem; every element must be a String
 * @return the stemmed, lower-case words in input order (elements are Strings)
 */
public Object[] stemming(Object[] hasilStopwordRemoval) {
    double timeStart = (double) System.nanoTime();
    porterStemmer = new Stemmer();
    ArrayList<String> hasilStemming = new ArrayList<String>();

    for (int i = 0; i < hasilStopwordRemoval.length; i++) {
        char[] sequenceWord = ((String) hasilStopwordRemoval[i]).toLowerCase().toCharArray();
        porterStemmer.add(sequenceWord, sequenceWord.length);
        porterStemmer.stem();
        String rootWord = porterStemmer.toString().toLowerCase();
        hasilStemming.add(rootWord);
    }

    // timing of the stemming step
    double timeFinish = (double) System.nanoTime();
    double time = ((double) (timeFinish - timeStart)) / 1000000000;
    System.out.println("Start time stemming : " + timeStart);
    System.out.println("Finish time stemming : " + timeFinish);
    System.out.println("Total time use : " + time);
    // number of words after stemming
    System.out.println("Jumlah kata setelah stemming : " + hasilStemming.size());
    return hasilStemming.toArray();
}
/**
 * Tells whether a word is listed in either stop-word table.
 *
 * @param string word to look up; it is lower-cased before the lookup
 * @return true if the lower-cased word is found in kamusStopWordHash or,
 *         failing that, in kamusStopWordEngHash
 */
public boolean isInStopWordDictionary(String string) {
    final String lowered = string.toLowerCase();

    // first table: stop words loaded from stopword.txt
    final String firstHit = (String) kamusStopWordHash.get(lowered);
    if (lowered.equalsIgnoreCase(firstHit)) {
        return true;
    }

    // second table: stop words loaded from stopwordeng.txt
    final String secondHit = (String) kamusStopWordEngHash.get(lowered);
    return lowered.equalsIgnoreCase(secondHit);
}
/**
 * Tells whether a word is present in the dictionary table kamusHash.
 *
 * @param string word to look up; it is lower-cased before the lookup
 * @return true if the table holds an entry for the lower-cased word
 */
public boolean isInDictionary(String string) {
    final String lowered = string.toLowerCase();
    final String stored = (String) kamusHash.get(lowered);
    // equalsIgnoreCase(null) is false, so a missing entry yields false
    return lowered.equalsIgnoreCase(stored);
}
double timeStart = (double) System.nanoTime(); String checkedData;
ArrayList<String> hasilStopWordRemoval = new ArrayList<String>(); //permulaan pengecekan dan penghapusan stopwords dengan data hasil Ektraksi for (int i = 0; i < dataEkstraksi.length; i++) {
checkedData = (String) dataEkstraksi[i];
if (isInStopWordDictionary(checkedData) || checkedData.equals ("") || checkedData.length() <= 2 || checkedData.contains(" ")) {
} else {
hasilStopWordRemoval.add(checkedData); } } //perhitungan waktu proses stopwordsRemoval
double timeFinish = (double) System.nanoTime();
double time = ((double) (timeFinish - timeStart)) / 1000000000; System.out.println("Start time stopword Removal : " + timeStart); System.out.println("Finish time stopword Removal : " + timeFinish); System.out.println("Total time use : " + time);
//menampilkan jumlah kata setelah stopword removal
System.out.println("Jumlah kata setelah stopword removal : " + hasilStopWordRemoval.size()); //mengembalikan hasil data array stopword removal
return hasilStopWordRemoval.toArray(); } public Object[] bacaDocument(String bookSource) {
double timeStart = (double) System.nanoTime(); String inputFile = bookSource;
Object[] dataTextArray = null;
ArrayList<String> dataTextVector = new ArrayList<String>(); try {
reader = new PdfReader(inputFile);
// System.out.println(reader.getNumberOfPages()); int numberOfPages = reader.getNumberOfPages(); // System.out.println("page"+numberOfPages);
//dimulai pengulangan untuk ektraksi kata pada tiap halaman String dataText = "";
// for (int i = 0; i < numberOfPages - (numberOfPages - 5); i++) { for (int i = 0; i < numberOfPages; i++) {
//perubahan for menjadi pendek
// dataText = dataText.concat(PdfTextExtractor.getTextFromPage(reader, i + 1)); dataText = PdfTextExtractor.getTextFromPage(reader, i + 1); // System.out.println(dataText.length()); dataText = dataText.replaceAll("[^a-zA-Z0-9]", " "); dataText = dataText.replaceAll(" ", " "); dataText = dataText.replaceAll(" ", " "); dataText = dataText.replaceAll(" ", " "); dataText = dataText.replaceAll(" ", " "); // System.out.println(dataText);
//tokenizing kata menjadi array dataTextArray = dataText.split(" "); // System.out.println(dataTextArray.length);
for (int j = 0; j < dataTextArray.length; j++) {
dataTextArrayList.add(dataTextArray[j].toString()); } } reader.close();
} catch (IOException ex) {
Logger.getLogger(PreProcessing.class.getName()).log(Level.SEVERE, null, ex); } //perhitungan waktu pemrosesan ektraksi kata
double timeFinish = (double) System.nanoTime();
double time = ((double) (timeFinish - timeStart)) / 1000000000; System.out.println("Start time extraction : " + timeStart); System.out.println("Finish time extraction : " + timeFinish); System.out.println("Total time use : " + time);
//menampilkan hasil panjang array keseluruhan kata dari semua halaman System.out.println("Jumlah keseluruhan array kata : " + dataTextVector.size()); //mengembalikan hasil ekstraksi
dataTextArray = dataTextVector.toArray(); return dataTextArray; }
/**
 * Manual smoke test: stems the single word "pentingnya" and prints the result.
 *
 * @param args unused
 */
public static void main(String[] args) {
    PreProcessing preProcess = new PreProcessing();
    Object[] hasilStopwordRemoval = {"pentingnya"};
    Object[] hasilStemming = preProcess.stemming(hasilStopwordRemoval);
    System.out.println("" + (String) hasilStemming[0]);
}
}
Lampiran 2 Listing IndexingProcess
package skripsi; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.InputStreamReader; import java.lang.reflect.Array; import java.util.ArrayList; import java.util.logging.Level; import java.util.logging.Logger; import javax.naming.spi.DirectoryManager; import skripsi.Koleksi; /** * * @author Lie_Lianx */
/**
 * Builds, saves, loads and reports on a chained hash-table index that maps
 * words to the books containing them.
 */
public class IndexingProcess {

    /** The hash table: one list of Koleksi entries per bucket. */
    private ArrayList<Koleksi>[] koleksiArrayList;
    /** Number of buckets in the hash table. */
    private int jumlahBucket;
    /** Directory prefix prepended to every file name that is saved or loaded. */
    private String workSpace;
    GeneralHashFunctionLibrary ghfl = new GeneralHashFunctionLibrary();
    /** Number of distinct words currently held in the table. */
    int wordCount;
    /** Multiplier of the multiplication hash method: (sqrt(5) - 1) / 2. */
    double knuth;

    /** Creates an indexer whose bucket count still has to be set. */
    public IndexingProcess() {
        knuth = ((double) (((Math.sqrt(5))) - 1) / 2);
    }

    /**
     * Creates an indexer with the given bucket count.
     *
     * Delegates to the no-argument constructor so that knuth is initialised;
     * previously it stayed 0 here, which sent every word to bucket 0 under the
     * multiplication method.
     *
     * @param jumlahBucket number of buckets of the hash table
     */
    public IndexingProcess(int jumlahBucket) {
        this();
        this.jumlahBucket = jumlahBucket;
    }

    /**
     * @param jumlahBucket the jumlahBucket to set
     */
    public void setJumlahBucket(int jumlahBucket) {
        this.jumlahBucket = jumlahBucket;
    }

    /**
     * @return the jumlahBucket
     */
    public int getJumlahBucket() {
        return jumlahBucket;
    }

    /**
     * @param koleksiArrayList the koleksiArrayList to set
     */
    public void setKoleksiArrayList(ArrayList<Koleksi>[] koleksiArrayList) {
        this.koleksiArrayList = koleksiArrayList;
    }

    /**
     * @return the koleksiArrayList
     */
    public ArrayList<Koleksi>[] getKoleksiArrayList() {
        return koleksiArrayList;
    }
public static void main(String[] args) { // //comment jika LOAD
// PreProcessing preProcess = new PreProcessing(); // //comment jika LOAD
IndexingProcess indexingProcess = new IndexingProcess();
indexingProcess.setWorkSpace("C:\\Users\\Lie_Lianx\\Desktop\\TA\\finish4\\try1\\"); //// comment jika LOAD
//// indexingProcess.setJumlahBucket(193); //// indexingProcess.setJumlahBucket(383); //// indexingProcess.setJumlahBucket(769); //// indexingProcess.setJumlahBucket(1531); //// indexingProcess.setJumlahBucket(3067); //// indexingProcess.setJumlahBucket(6143); //// indexingProcess.setJumlahBucket(12289); //// indexingProcess.setJumlahBucket(24571); //// indexingProcess.setJumlahBucket(49157); //// indexingProcess.setJumlahBucket(98299); // // String hashMethod; // int mixingStepCode; // boolean skip = true; //
// //book source
// String bookSource = "G:\\Skripsi\\pustaka\\E-Book\\training_ready\\ebook"; //// String bookSource = "H:\\ROY BACKUP\\Skripsi\\Stopword_stemming"; // //end of book source
//
// //akses buku didalam folder // File dir = new File(bookSource); // String[] bookName = dir.list(); //// System.out.println(bookName.length); // //selesai akses buku didalam folder // // Object[] dataEkstraksi; //// ArrayList<String> dataEkstraksi2; // Object[] hasilStopwordRemoval; // Object[] hasilStemming; //
// //memulai perulangan untuk semua mixingStep yang ada // for (int j = 1; j < 5; j++) {
// mixingStepCode = j;
// //memulai perulangan untuk 2 jenis Hash Function // for (int k = 0; k < 2; k++) { // if (k == 0) { // hashMethod = "division"; // } else { // hashMethod = "multiplication"; // } //
//// System.out.println("Total Memory Allowed : " + Runtime.getRuntime().totalMemory()); //// System.out.println("Max Memory Allowed : " + Runtime.getRuntime().maxMemory()); //// System.out.println("Free Memory before GC : " + Runtime.getRuntime().freeMemory()); //// System.gc();
//// System.out.println("Free Memory after GC : " + Runtime.getRuntime().freeMemory()); //
// //mulai menghitung waktu proses dan free memory // double timeStart = (double) System.nanoTime(); //// System.gc();
// long freeMemoryStart = Runtime.getRuntime().freeMemory(); //
// //mulai perulangan sebanyak file buku yang ada di dalam folder
// //bookCount untuk menghitung jumlah yg merupakan buku di dalam folder source // int bookCount = 0;
// for (int i = 0; i < bookName.length; i++) { // if (bookName[i].endsWith(".pdf")) { // bookCount++;
// //preprocessing data
// System.out.println(i + "-->" + bookSource + "\\" + bookName[i]);
// dataEkstraksi = preProcess.bacaDocument(bookSource + "\\" + bookName[i]); //// dataEkstraksi2 = preProcess.bacaDocument2(bookSource + "\\" + bookName[i]); // hasilStopwordRemoval = preProcess.stopwordRemoval(dataEkstraksi);
// dataEkstraksi = null;
//// for (int l = 0; l < hasilStopwordRemoval.length; l++) { //// System.out.println(hasilStopwordRemoval[l]); //// }
// // Object[] hasilStopwordRemoval = {"seseorang"}; // hasilStemming = preProcess.stemming(hasilStopwordRemoval); // hasilStopwordRemoval = null;
// //selesai preprocessing data //
// //mulai proses indexing
// String idBuku = "buku " + bookCount; // if (bookCount == 1) {
// indexingProcess.startIndexingProcess(hasilStemming, "start", idBuku, hashMethod, mixingStepCode); // } else {
// indexingProcess.startIndexingProcess(hasilStemming, "continue", idBuku, hashMethod, mixingStepCode); // }
// hasilStemming = null; // //selesai proses indexing // }
// }
// //selesai perulangan untuk semua buku dalam folder //
//
// //hitung free memory setelah process
// long freeMemoryFinish = Runtime.getRuntime().freeMemory(); //
////// comment jika LOAD //// if (!skip) {
// //
//// //save HashTable
// indexingProcess.saveIndexHashTable("HashTableEBooks_" + hashMethod + "_" + mixingStepCode + "_" + indexingProcess.getJumlahBucket() + ".txt");
//// //selesai save HashTable //selingan jika Hash berasal dari LOAD
// System.out.println(indexingProcess.getKoleksiArrayList().length); indexingProcess.loadIndexHashTable("HashTableEBooks_division_1_383.txt"); // System.out.println(indexingProcess.getKoleksiArrayList()[0].get(3).getKata()); // System.out.println(indexingProcess.getKoleksiArrayList().length);
//selesai selingan jika Hash berasal dari LOAD //// //catat LOG
// indexingProcess.saveLOGHashTable("LOGHashTableEBooks_" + hashMethod + "_" + mixingStepCode + "_" + indexingProcess.getJumlahBucket() + ".txt");
//// //selesai catat LOG //proses search
try {
double timeStart = 0; double timeFinish = 0; double time; boolean cont = true; String searchWord;
BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); searchingProcess search = new searchingProcess(indexingProcess.getKoleksiArrayList()); while (cont) {
System.out.print("\n\nMasukkan kata yang ingin dicari : "); searchWord = br.readLine();
if (!searchWord.equalsIgnoreCase("xxx")) { timeStart = (double) System.nanoTime();
timeFinish = (double) System.nanoTime();
time = ((double) (timeFinish - timeStart)) / 1000000000;
System.out.println("---"); System.out.println("Start time All Search HashTable : " + timeStart);
System.out.println("Finish time All Search HashTable : " + timeFinish); System.out.println("Total time use for All search : " + time);
System.out.println("---"); } else { cont = false; } System.gc(); } System.gc(); } catch (Exception e) {
System.out.println("Error reading Input!!"); }
//selesai proses search // //comment jika LOAD
// double timeFinish = (double) System.nanoTime();
// double time = ((double) (timeFinish - timeStart)) / 1000000000;
// System.out.println("Start time ALL (" + hashMethod + "_" + mixingStepCode + ") : " + timeStart); // System.out.println("Finish time ALL (" + hashMethod + "_" + mixingStepCode + ") : " + timeFinish); // System.out.println("Total time use ALL (" + hashMethod + "_" + mixingStepCode + ") : " + time); //
//
// System.out.println("Max Memory Allowed : " + Runtime.getRuntime().maxMemory()); // System.out.println("Free Memory start process : " + freeMemoryStart + " bytes"); // System.out.println("Free Memory finish process : " + freeMemoryFinish + " bytes");
// System.out.println("Memory used for process : " + ((long) (freeMemoryFinish - freeMemoryStart)) + " bytes"); // System.out.println("Free Memory before GC : " + Runtime.getRuntime().freeMemory() + " bytes");
// System.gc();
// System.out.println("Free Memory after GC : " + Runtime.getRuntime().freeMemory() + " bytes"); // // //// //untuk skip //// }else{ //// skip=false; //// j--; //// } // //
// //comment jika LOAD // } // if (skip) { // skip = false; // j--; // } // }
// //comment jika LOAD }
/**
 * Writes the hash table to a text file in the workspace and prints timing
 * information.
 *
 * File layout, one value per line: the bucket count; then for every bucket
 * its entry count; for every entry the word, the number of books, and for
 * every book its id followed by the word's count in that book.
 * I/O failures are logged, not thrown.
 *
 * @param namaFile file name, appended to the workspace path
 */
public void saveIndexHashTable(String namaFile) {
    double timeStart = (double) System.nanoTime();
    BufferedWriter bw = null;
    try {
        String lokasiSaveHashTable = getWorkSpace() + namaFile;
        bw = new BufferedWriter(new FileWriter(lokasiSaveHashTable));
        bw.write(String.valueOf(getJumlahBucket()));
        bw.newLine();
        for (int i = 0; i < getJumlahBucket(); i++) {
            ArrayList<Koleksi> bucket = koleksiArrayList[i];
            bw.write(String.valueOf(bucket.size()));
            bw.newLine();
            for (int j = 0; j < bucket.size(); j++) {
                Koleksi entry = bucket.get(j);
                bw.write(entry.getKata());
                bw.newLine();
                bw.write(String.valueOf(entry.getIdBuku().size()));
                bw.newLine();
                for (int k = 0; k < entry.getIdBuku().size(); k++) {
                    bw.write(entry.getIdBuku().get(k));
                    bw.newLine();
                    bw.write(String.valueOf(entry.getJumlahKataDalamBuku().get(k)));
                    bw.newLine();
                }
            }
        }
    } catch (IOException ex) {
        Logger.getLogger(IndexingProcess.class.getName()).log(Level.SEVERE, null, ex);
    } finally {
        // bw stays null when the file could not be opened; closing it also
        // flushes and closes the underlying FileWriter
        if (bw != null) {
            try {
                bw.close();
            } catch (IOException ex) {
                Logger.getLogger(IndexingProcess.class.getName()).log(Level.SEVERE, null, ex);
            }
        }
    }
    double timeFinish = (double) System.nanoTime();
    double time = ((double) (timeFinish - timeStart)) / 1000000000;
    System.out.println("Start time Save HashTable : " + timeStart);
    System.out.println("Finish time Save HashTable : " + timeFinish);
    System.out.println("Total time use : " + time);
}
public void loadIndexHashTable(String namaFile) { double timeStart = (double) System.nanoTime(); double timeFinish;
double time; FileReader fr = null; wordCount = 0; try {
String lokasiLoadHashTable = getWorkSpace() + namaFile; fr = new FileReader(lokasiLoadHashTable);
BufferedReader br = new BufferedReader(fr); String jumlahBucket = "";
try {
//baca jumlahBucket jumlahBucket = br.readLine();
this.setJumlahBucket(Integer.parseInt(jumlahBucket)); //selesai baca jumlahBucket
//buat koleksiArrayList baru dari hasil Load sesuai jumlahBucket
setKoleksiArrayList((ArrayList<Koleksi>[]) Array.newInstance(ArrayList.class, this.getJumlahBucket())); for (int i = 0; i < this.getJumlahBucket(); i++) {
koleksiArrayList[i] = new ArrayList<Koleksi>(); }
//selesai buat koleksiArrayList baru dari hasil Load sesuai jumlahBucket for (int i = 0; i < this.getJumlahBucket(); i++) {
//baca jumlah Koleksi dari bucket i String jumlahKoleksi = br.readLine(); //for untuk jumlah Koleksi
for (int j = 0; j < Integer.parseInt(jumlahKoleksi); j++) { Koleksi koleksi = new Koleksi();
wordCount++;
String kata = br.readLine(); koleksi.setKata(kata);
String jumlahKoleksiUntukKata = br.readLine();
for (int k = 0; k < Integer.parseInt(jumlahKoleksiUntukKata); k++) { String idBuku = br.readLine();
koleksi.getIdBuku().add(idBuku);
String jumlahKataDalamBuku = br.readLine();
koleksi.getJumlahKataDalamBuku().add(Integer.parseInt(jumlahKataDalamBuku)); } koleksiArrayList[i].add(koleksi); } }
Logger.getLogger(IndexingProcess.class.getName()).log(Level.SEVERE, null, ex); } finally {
try { br.close();
} catch (IOException ex) {
Logger.getLogger(IndexingProcess.class.getName()).log(Level.SEVERE, null, ex); } }
} catch (FileNotFoundException ex) {
Logger.getLogger(IndexingProcess.class.getName()).log(Level.SEVERE, null, ex); } finally {
try { fr.close();
} catch (IOException ex) {
Logger.getLogger(IndexingProcess.class.getName()).log(Level.SEVERE, null, ex); } }
timeFinish = (double) System.nanoTime();
time = ((double) (timeFinish - timeStart)) / 1000000000; System.out.println("Start time Load HashTable : " + timeStart); System.out.println("Finish time Load HashTable : " + timeFinish); System.out.println("Total time use : " + time);
System.out.println("Total words loaded in HashTable : " + wordCount); }
public void startIndexingProcess(Object[] hasilStemmingIndonesia, String status, String idBuku, String hashMethod, int mixingStepTipe) {
double timeStart = (double) System.nanoTime(); double timeFinish;
double time;
//pengecekan status pengindeksan apakah permulaan atau sambungan if (status.equalsIgnoreCase("start")) {
wordCount = 0;
koleksiArrayList = (ArrayList<Koleksi>[]) Array.newInstance(ArrayList.class, jumlahBucket); for (int i = 0; i < jumlahBucket; i++) {
koleksiArrayList[i] = new ArrayList<Koleksi>(); } }
//mulai meletakkan kedalam hash table perkata long mixingCode;
String word;
for (int i = 0; i < hasilStemmingIndonesia.length; i++) { word = ((String) hasilStemmingIndonesia[i]); mixingCode = 0;
mixingCode=MS.PJWHash(word); //penghilangan angka negatif mixingCode = Math.abs(mixingCode); int hashCode;
if (hashMethod.equalsIgnoreCase("division")) { //division method
hashCode = (int) (mixingCode % this.getJumlahBucket()); //end division method
} else {
//multiplication method
hashCode = (int) (this.getJumlahBucket() * ((mixingCode * knuth) % 1)); //end multiplication method
}
boolean isIdBukuFind = false; boolean isWordFind = false; int posisiKataInHashTable = 0;
for (int j = 0; j < this.getKoleksiArrayList()[hashCode].size(); j++) {
if (this.getKoleksiArrayList()[hashCode].get(j).getKata().equalsIgnoreCase((String) hasilStemmingIndonesia[i])) { isWordFind = true;
posisiKataInHashTable = j;
for (int k = 0; k < this.getKoleksiArrayList()[hashCode].get(j).getIdBuku().size(); k++) { if (this.getKoleksiArrayList()[hashCode].get(j).getIdBuku().get(k).equalsIgnoreCase(idBuku)) {
isIdBukuFind = true;
this.getKoleksiArrayList()[hashCode].get(j).getJumlahKataDalamBuku().set(k, this.getKoleksiArrayList()[hashCode].get(j).getJumlahKataDalamBuku().get(k) + 1);
break; } } break;
} } if (!isWordFind) {
wordCount++;
Koleksi temp = new Koleksi();
temp.setKata((String) hasilStemmingIndonesia[i]); temp.getIdBuku().add(idBuku); temp.getJumlahKataDalamBuku().add(1); this.getKoleksiArrayList()[hashCode].add(temp); } else if (!isIdBukuFind) { koleksiArrayList[hashCode].get(posisiKataInHashTable).getIdBuku().add(idBuku); koleksiArrayList[hashCode].get(posisiKataInHashTable).getJumlahKataDalamBuku().add(1); } }
//selesai meletakkan kedalam hash table perkata // memulai perhitungan waktu Hash
timeFinish = (double) System.nanoTime();
time = ((double) (timeFinish - timeStart)) / 1000000000;
System.out.println("Start time indexing HashTable : " + timeStart); System.out.println("Finish time indexing HashTable : " + timeFinish); System.out.println("Total time use : " + time);
System.out.println("Total words created in HashTable : " + wordCount); }
public void saveLOGHashTable(String namaFile) { double timeStart = (double) System.nanoTime(); double timeFinish;
double time; FileWriter fw = null; try {
String lokasiSaveLOG = getWorkSpace() + namaFile; fw = new FileWriter(lokasiSaveLOG);
BufferedWriter bw = new BufferedWriter(fw); int total = 0;
double average = 0;
double standardDeviation = 0; double varians = 0;
int max = Integer.MIN_VALUE; int min = Integer.MAX_VALUE;
for (int i = 0; i < this.getJumlahBucket(); i++) { int ukuranChain = this.getKoleksiArrayList()[i].size(); //menulis kedalam LOG
bw.write(String.valueOf(ukuranChain)); bw.write("\t");
for (int j = 0; j < ukuranChain; j++) { bw.write("-"); } bw.newLine();
//selesai menulis kedalam LOG total = total + ukuranChain; if (ukuranChain > max) {
max = ukuranChain; } if (ukuranChain < min) {
min = ukuranChain; } } average = total / jumlahBucket;
//perhitungan standar deviasi double totalKuadrat = 0;
for (int i = 0; i < jumlahBucket; i++) {
int ukuranChain = this.getKoleksiArrayList()[i].size();
totalKuadrat = totalKuadrat + (Math.pow((ukuranChain - average), 2)); } varians = totalKuadrat / jumlahBucket;
standardDeviation = Math.sqrt((totalKuadrat / jumlahBucket)); //selesai perhitungan standar deviasi
bw.newLine();
bw.write("Total Hash Bucket=" + this.getKoleksiArrayList().length); bw.newLine();
bw.write("Total=" + total); bw.newLine(); bw.write("Max=" + max); bw.newLine(); bw.write("Min=" + min); bw.newLine(); bw.write("Average=" + average); bw.newLine(); bw.write("Varians=" + varians); bw.newLine();
bw.write("Standar Deviasi=" + standardDeviation); bw.newLine(); //tambahan int counterZeroBucket=0; int counterSector1=0,counterSector2=0,counterSector3=0,counterSector4=0,counterSector5=0; double sector1=0,sector2=0,sector3=0,sector4=0,sector5=0; double sizeOfEachSector=((double)max/5); sector1=(sizeOfEachSector*1); sector1=Double.parseDouble(String.format("%.5g%n",sector1)); sector2=(sizeOfEachSector*2); sector2=Double.parseDouble(String.format("%.5g%n",sector2)); sector3=(sizeOfEachSector*3); sector3=Double.parseDouble(String.format("%.5g%n",sector3)); sector4=(sizeOfEachSector*4); sector4=Double.parseDouble(String.format("%.5g%n",sector4)); sector5=(sizeOfEachSector*5); sector5=Double.parseDouble(String.format("%.5g%n",sector5)); for (int i = 0; i < this.getKoleksiArrayList().length; i++) {
if (this.getKoleksiArrayList()[i].size()==0) { counterZeroBucket++; }else if (this.getKoleksiArrayList()[i].size()<sector1) { counterSector1++; }else if (this.getKoleksiArrayList()[i].size()<sector2) { counterSector2++; }else if (this.getKoleksiArrayList()[i].size()<sector3) { counterSector3++; }else if (this.getKoleksiArrayList()[i].size()<sector4) { counterSector4++; }else if (this.getKoleksiArrayList()[i].size()<=sector5) { counterSector5++; } } double emptyBucket=(double)(((double)counterZeroBucket)/((double)this.getKoleksiArrayList().length))*100; double filledBucket=(double)((((double)this.getKoleksiArrayList().length)-((double)counterZeroBucket))/((double)this.getKoleksiArrayList().length))*100;
bw.write("Empty Bucket (%) =" + emptyBucket); bw.newLine();
bw.write("Filled Bucket (%) =" + filledBucket); bw.newLine();
bw.write("Sector 1 : 0<=x< " + sector1+" ="+counterSector1); bw.newLine();
bw.write("Sector 2 : "+(sector1-1)+"<x< " + sector2+" ="+counterSector2); bw.newLine();
bw.write("Sector 3 : "+(sector2-1)+"<x< " + sector3+" ="+counterSector3); bw.newLine();
bw.write("Sector 4 : "+(sector3-1)+"<x< " + sector4+" ="+counterSector4); bw.newLine();
bw.write("Sector 5 : "+(sector4-1)+"<x<= " + sector5+" ="+counterSector5); bw.newLine();
//tambahan bw.newLine(); //persiapan data excel
for (int i = 0; i < jumlahBucket; i++) {
bw.write(String.valueOf(this.getKoleksiArrayList()[i].size())); bw.newLine(); }
//selesai persiapan data excel bw.close();
} catch (IOException ex) {
Logger.getLogger(IndexingProcess.class.getName()).log(Level.SEVERE, null, ex); } finally {
try { fw.close();
} catch (IOException ex) {
Logger.getLogger(IndexingProcess.class.getName()).log(Level.SEVERE, null, ex); } } timeFinish = (double) System.nanoTime();
time = ((double) (timeFinish - timeStart)) / 1000000000; System.out.println("Start time LOG HashTable : " + timeStart); System.out.println("Finish time LOG HashTable : " + timeFinish); System.out.println("Total time use : " + time); }
/**
 * Returns the directory prefix that is prepended to saved and loaded file names.
 *
 * @return the workSpace
 */
public String getWorkSpace() {
    return this.workSpace;
}

/**
 * Sets the directory prefix that is prepended to saved and loaded file names.
 *
 * @param workSpace the workSpace to set
 */
public void setWorkSpace(String workSpace) {
    this.workSpace = workSpace;
}
}
Lampiran 3 Listing SearchingProcess
/*
* To change this template, choose Tools | Templates * and open the template in the editor.
*/ package skripsi; import java.util.ArrayList; /** * * @author Lie_Lianx */
/**
 * Keyword search over a hash-table index of Koleksi entries.
 */
public class searchingProcess {

    /** The hash table to search: one list of entries per bucket. */
    private ArrayList<Koleksi>[] koleksiArrayList;
    /** Pre-processor applied to the keywords before they are looked up. */
    private PreProcessing preProcessing;
    /** Multiplier of the multiplication hash method: (sqrt(5) - 1) / 2. */
    double knuth;

    /**
     * Creates a searcher over the given table and a fresh PreProcessing instance.
     *
     * @param koleksiArrayList the hash table to search
     */
    public searchingProcess(ArrayList<Koleksi>[] koleksiArrayList) {
        this.preProcessing = new PreProcessing();
        this.knuth = ((double) (((Math.sqrt(5))) - 1) / 2);
        this.koleksiArrayList = koleksiArrayList;
    }
public void searchInHashTable(String keyWords, String hashMethod, int mixingStepTipe) { String[] keywordArray = keyWords.split(" ");
Object[] stopwordRemoval = preProcessing.stopwordRemoval(keywordArray); Object[] stemmingIndonesia = preProcessing.stemming(stopwordRemoval); String word;
long mixingCode;
ArrayList<Koleksi> andSearch=new ArrayList<Koleksi>(); for (int i = 0; i < stemmingIndonesia.length; i++) {
String keyWord = (String) stemmingIndonesia[i]; MixingStep MS = new MixingStep();
mixingCode = 0;
word = ((String) stemmingIndonesia[i]);
//perhitungan waktu mulai pencarian menggunakan HashTable double timeStart = (double) System.nanoTime();
double timeFinish=0; double time;
//perhitungan waktu mulai pencarian menggunakan HashTable mixingCode=MS.PJWHash(word);
int hashCode;
if (hashMethod.equalsIgnoreCase("division")) { //division method
hashCode = (int) (mixingCode % this.koleksiArrayList.length); //end division method
} else {
//multiplication method
hashCode = (int) (this.koleksiArrayList.length * ((mixingCode * knuth) % 1)); //end multiplication method
}
//proses pencarian dengan menggunakan hashCode boolean isFindInHashTable = false;
ArrayList<Koleksi> bucket = this.getKoleksiArrayList()[hashCode]; for (int j = 0; j < bucket.size(); j++) {
if (bucket.get(j).getKata().equalsIgnoreCase(keyWord)) { timeFinish = (double) System.nanoTime();
System.out.println("\nBuku yang mengandung keyWord : " + keyWord + ", berhasil ditemukan di " + bucket.get(j).getIdBuku().size() + " koleksi.");
andSearch.add(this.getKoleksiArrayList()[hashCode].get(j));
System.out.println("\nBuku yang mengandung keyWord : " + keyWord + ", berhasil ditemukan : \n"); for (int k = 0; k < this.getKoleksiArrayList()[hashCode].get(j).getIdBuku().size(); k++) {
String idBuku = this.getKoleksiArrayList()[hashCode].get(j).getIdBuku().get(k);
int jumlahKataDalamBuku = this.getKoleksiArrayList()[hashCode].get(j).getJumlahKataDalamBuku().get(k); isFindInHashTable = true;
System.out.println("ID Buku : " + idBuku + " --- Jumlah kata : " + jumlahKataDalamBuku); } isFindInHashTable = true; break; } } if (!isFindInHashTable) {
timeFinish = (double) System.nanoTime();
System.out.println("\nBuku yang mengandung keyWord : " + keyWord + ", gagal ditemukan."); }
//selesai proses pencarian dengan menggunakan hashCode }
if (andSearch.size()==stemmingIndonesia.length) {
int positionOfSmallestSumOfBookID=Integer.MAX_VALUE; for (int i = 0; i < andSearch.size(); i++) {
int sumOfBookID=andSearch.get(i).getIdBuku().size(); if (sumOfBookID<=positionOfSmallestSumOfBookID) {
positionOfSmallestSumOfBookID=i; } }
Koleksi temp=new Koleksi();
Koleksi selectedKoleksi=andSearch.get(positionOfSmallestSumOfBookID); temp.setKata(selectedKoleksi.getKata()); temp.setIdBuku(selectedKoleksi.getIdBuku()); temp.setJumlahKataDalamBuku(selectedKoleksi.getJumlahKataDalamBuku()); andSearch.set(positionOfSmallestSumOfBookID, andSearch.get(0)); andSearch.set(0, temp); ArrayList<String> listOfBookID=andSearch.get(0).getIdBuku(); boolean find=true;
for (int i = 0; i < listOfBookID.size(); i++) { find=true;
ArrayList<Integer> frek=new ArrayList<Integer>(); idBuku=andSearch.get(0).getIdBuku().get(i);
frek.add(andSearch.get(0).getJumlahKataDalamBuku().get(i)); for (int j = 1; j < andSearch.size(); j++) {
if (!andSearch.get(j).getIdBuku().contains(listOfBookID.get(i))) { find=false; break; }else{ int index=andSearch.get(j).getIdBuku().indexOf(listOfBookID.get(i)); frek.add(andSearch.get(j).getJumlahKataDalamBuku().get(index)); } } if (find) {
System.out.println("Hasil pencarian AND didapat pada koleksi : "+idBuku+" dengan rincian : "); for (int j = 0; j < andSearch.size(); j++) {
System.out.println("Kata "+andSearch.get(j).getKata()+" ditemukan sebanyak : "+frek.get(j)+" buah."); } } }
}else{
System.out.println("Hasil pencarian AND, tidak berhasil ditemukan."); } }
/**
* Returns the bucket array used by the search code in this class: each slot,
* selected by a hash code, holds the list of Koleksi entries stored in that bucket.
* The internal array itself is returned, not a copy.
*
* @return the koleksiArrayList */
public ArrayList<Koleksi>[] getKoleksiArrayList() { return koleksiArrayList;
} /**
* Replaces the bucket array. The given array is stored by reference, not copied.
*
* @param koleksiArrayList the koleksiArrayList to set */
public void setKoleksiArrayList(ArrayList<Koleksi>[] koleksiArrayList) { this.koleksiArrayList = koleksiArrayList;
}}
Lampiran 4 Listing Koleksi
package skripsi; import java.util.ArrayList; /** * * @author Lie_Lianx */
public class Koleksi {
    /*
     * Index entry for one word (kata). idBuku and jumlahKataDalamBuku are
     * parallel lists: callers read position k of both lists together, so
     * jumlahKataDalamBuku.get(k) is the word count for the book idBuku.get(k).
     */

    /** The indexed word. */
    private String kata;

    /** IDs of the books that contain the word. */
    private ArrayList<String> idBuku;

    /** Occurrence count of the word per book, aligned by position with idBuku. */
    private ArrayList<Integer> jumlahKataDalamBuku;

    /** Creates an entry with an empty word and two empty lists. */
    public Koleksi() {
        kata = "";
        idBuku = new ArrayList<String>();
        jumlahKataDalamBuku = new ArrayList<Integer>();
    }

    /**
     * @return the kata
     */
    public String getKata() {
        return kata;
    }

    /**
     * @param kata the kata to set
     */
    public void setKata(String kata) {
        this.kata = kata;
    }

    /**
     * Returns the internal list itself, not a copy.
     *
     * @return the idBuku
     */
    public ArrayList<String> getIdBuku() {
        return idBuku;
    }

    /**
     * Stores the given list by reference, not as a copy.
     *
     * @param idBuku the idBuku to set
     */
    public void setIdBuku(ArrayList<String> idBuku) {
        this.idBuku = idBuku;
    }

    /**
     * Returns the internal list itself, not a copy.
     *
     * @return the jumlahKataDalamBuku
     */
    public ArrayList<Integer> getJumlahKataDalamBuku() {
        return jumlahKataDalamBuku;
    }

    /**
     * Stores the given list by reference, not as a copy.
     *
     * @param jumlahKataDalamBuku the jumlahKataDalamBuku to set
     */
    public void setJumlahKataDalamBuku(ArrayList<Integer> jumlahKataDalamBuku) {
        this.jumlahKataDalamBuku = jumlahKataDalamBuku;
    }
}
Lampiran 5 Listing Stemmer
package skripsi; /** * * @author Lie_Lianx */ /*
Porter stemmer in Java. The original paper is in
Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, no. 3, pp 130-137,
See also http://www.tartarus.org/~martin/PorterStemmer History:
Release 1
Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below. The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1] is then outside the bounds of b.
Release 2 Similarly,
Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below. 'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and b[j] is then outside the bounds of b.
Release 3
Considerably revised 4/9/00 in the light of many helpful suggestions from Brian Goetz of Quiotix Corporation ([email protected]). Release 4
*/
/**
* Stemmer, implementing the Porter Stemming Algorithm *
* The Stemmer class transforms a word into its root form. The input * word can be provided a character at a time (by calling add()), or at once * by calling one of the various stem(something) methods.
*/
class Stemmer { private char[] b; /* buffer holding the characters of the word being stemmed */
private int i, /* offset into b: add() stores the next character at b[i] */
i_end, /* offset to end of stemmed word; toString() returns b[0..i_end) */ j, k; /* working indices into b: ends() compares the suffix that finishes at b[k] and, on a match, sets j = k - suffix length; setto() writes from b[j+1] and resets k */
/* Growth step: the buffer b is enlarged by this many slots whenever it is full. */
private static final int INC = 50;

/** Creates a stemmer whose buffer starts with room for INC characters and holds no word yet. */
public Stemmer() {
    i_end = 0;
    i = 0;
    b = new char[INC];
}

/**
 * Appends a single character to the word being stemmed, enlarging the
 * buffer by INC slots first when it is already full. Once every character
 * has been added, the word can be stemmed by calling stem().
 */
public void add(char ch) {
    if (i == b.length) {
        char[] grown = new char[i + INC];
        System.arraycopy(b, 0, grown, 0, i);
        b = grown;
    }
    b[i] = ch;
    i++;
}
/** Adds wLen characters to the word being stemmed contained in a portion * of a char[] array. This is like repeated calls of add(char ch), but
* faster. */
public void add(char[] w, int wLen) { if (i + wLen >= b.length) {
char[] new_b = new char[i + wLen + INC]; for (int c = 0; c < i; c++) {
new_b[c] = b[c]; }
b = new_b; }
for (int c = 0; c < wLen; c++) { b[i++] = w[c];
} } /**
* After a word has been stemmed, it can be retrieved by toString(), * or a reference to the internal buffer can be retrieved by getResultBuffer * and getResultLength (which is generally more efficient.)
*/
public String toString() { return new String(b, 0, i_end); }
* Returns the length of the word resulting from the stemming process. */
public int getResultLength() { return i_end;
} /**
* Returns a reference to a character buffer containing the results of * the stemming process. You also need to consult getResultLength() * to determine the length of the result.
*/
public char[] getResultBuffer() { return b;
}
/* cons(i) is true <=> b[i] is a consonant. The letters a, e, i, o, u are never
   consonants; 'y' is one at position 0 or when the letter before it is not a consonant. */
private final boolean cons(int i) {
    final char letter = b[i];
    if (letter == 'a' || letter == 'e' || letter == 'i' || letter == 'o' || letter == 'u') {
        return false;
    }
    if (letter == 'y') {
        return i == 0 || !cons(i - 1);
    }
    return true;
}
/* m() measures the number of consonant sequences between 0 and j. If c is a
   consonant sequence and v a vowel sequence, and <..> indicates arbitrary
   presence,
      <c><v>       gives 0
      <c>vc<v>     gives 1
      <c>vcvc<v>   gives 2
      <c>vcvcvc<v> gives 3
      ....
*/
private final int m() {
    int count = 0;
    int pos = 0;

    // Skip the optional leading consonant run.
    while (pos <= j && cons(pos)) {
        pos++;
    }
    if (pos > j) {
        return count;
    }
    pos++;

    while (true) {
        // Consume the rest of the vowel run.
        while (pos <= j && !cons(pos)) {
            pos++;
        }
        if (pos > j) {
            return count;
        }
        pos++;
        count++; // one complete vowel-consonant pair has been seen

        // Consume the rest of the consonant run.
        while (pos <= j && cons(pos)) {
            pos++;
        }
        if (pos > j) {
            return count;
        }
        pos++;
    }
}
/* vowelinstem() is true <=> some position in 0,...j holds a non-consonant (a vowel). */
private final boolean vowelinstem() {
    int pos = 0;
    while (pos <= j) {
        if (!cons(pos)) {
            return true;
        }
        pos++;
    }
    return false;
}
/* doublec(j) is true <=> positions j and (j-1) hold the same letter and that letter is a consonant. */
private final boolean doublec(int j) {
    return j >= 1 && b[j] == b[j - 1] && cons(j);
}
/* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
   and the final consonant is not w, x or y. This is used when trying to
   restore an e at the end of a short word, e.g.
      cav(e), lov(e), hop(e), crim(e), but snow, box, tray.
*/
private final boolean cvc(int i) {
    boolean hasShape = i >= 2 && cons(i) && !cons(i - 1) && cons(i - 2);
    if (!hasShape) {
        return false;
    }
    int last = b[i];
    return last != 'w' && last != 'x' && last != 'y';
}
/* ends(s) is true <=> the characters of b finishing at index k are exactly s.
   On a match, j is set to the index just before that suffix (k - s.length());
   on a mismatch j is left untouched. */
private final boolean ends(String s) {
    final int len = s.length();
    final int start = k - len + 1;
    if (start < 0) {
        return false; // the suffix is longer than the word so far
    }
    int offset = 0;
    while (offset < len) {
        if (b[start + offset] != s.charAt(offset)) {
            return false;
        }
        offset++;
    }
    j = k - len;
    return true;
}
/* setto(s) overwrites b starting at (j+1) with the characters of s and
   readjusts k so that it points at the last character written. */
private final void setto(String s) {
    final int len = s.length();
    int dest = j + 1;
    for (int src = 0; src < len; src++, dest++) {
        b[dest] = s.charAt(src);
    }
    k = j + len;
}
/* r(s) is used further down: it applies setto(s) only when m() > 0. */
private final void r(String s) {
    if (m() <= 0) {
        return;
    }
    setto(s);
}
/* step1() gets rid of plurals and -ed or -ing. e.g.
      caresses -> caress
      ponies   -> poni
      ties     -> ti
      caress   -> caress
      cats     -> cat
      feed     -> feed
      agreed   -> agree
      disabled -> disable
      matting  -> mat
      mating   -> mate
      meeting  -> meet
      milling  -> mill
      messing  -> mess
      meetings -> meet
*/
private final void step1() {
    // Plural endings are only examined when the word ends in 's'.
    if (b[k] == 's') {
        if (ends("sses")) {
            k -= 2;
        } else if (ends("ies")) {
            setto("i");
        } else if (b[k - 1] != 's') {
            k--;
        }
    }

    // "-eed" excludes the "-ed" / "-ing" handling below, whether or not it is shortened.
    if (ends("eed")) {
        if (m() > 0) {
            k--;
        }
        return;
    }

    boolean suffixRemovable = (ends("ed") || ends("ing")) && vowelinstem();
    if (!suffixRemovable) {
        return;
    }

    // Drop the suffix: the successful ends() call left j just before it.
    k = j;
    if (ends("at")) {
        setto("ate");
        return;
    }
    if (ends("bl")) {
        setto("ble");
        return;
    }
    if (ends("iz")) {
        setto("ize");
        return;
    }
    if (doublec(k)) {
        // Collapse the doubled consonant unless it is l, s or z.
        int doubled = b[k - 1];
        if (doubled != 'l' && doubled != 's' && doubled != 'z') {
            k--;
        }
        return;
    }
    if (m() == 1 && cvc(k)) {
        setto("e");
    }
}
/* step2() turns terminal y to i when there is another vowel in the stem. */
private final void step2() {
    boolean endsInY = ends("y");
    if (!endsInY || !vowelinstem()) {
        return;
    }
    b[k] = 'i';
}
/* step3() maps double suffices to single ones. so -ization ( = -ize plus -ation) maps to -ize etc. note that the string before the suffix must give m() > 0. */
private final void step3() {