001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019package org.apache.reef.examples.group.bgd.data.parser; 020 021import org.apache.commons.lang.StringUtils; 022import org.apache.reef.examples.group.bgd.data.Example; 023import org.apache.reef.examples.group.bgd.data.SparseExample; 024 025import javax.inject.Inject; 026import java.io.*; 027import java.nio.charset.StandardCharsets; 028import java.util.ArrayList; 029import java.util.List; 030import java.util.logging.Level; 031import java.util.logging.Logger; 032 033/** 034 * A Parser for SVMLight records. 035 */ 036public class SVMLightParser implements Parser<String> { 037 038 private static final Logger LOG = Logger.getLogger(SVMLightParser.class.getName()); 039 040 @Inject 041 public SVMLightParser() { 042 } 043 044 @Override 045 public Example parse(final String line) { 046 047 final int entriesCount = StringUtils.countMatches(line, ":"); 048 final int[] indices = new int[entriesCount]; 049 final float[] values = new float[entriesCount]; 050 051 final String[] entries = StringUtils.split(line, ' '); 052 String labelStr = entries[0]; 053 054 final boolean pipeExists = labelStr.indexOf('|') != -1; 055 if (pipeExists) { 056 labelStr = labelStr.substring(0, labelStr.indexOf('|')); 057 } 058 double label = Double.parseDouble(labelStr); 059 060 if (label != 1) { 061 label = -1; 062 } 063 064 for (int j = 1; j < entries.length; ++j) { 065 final String x = entries[j]; 066 final String[] entity = StringUtils.split(x, ':'); 067 final int offset = pipeExists ? 0 : 1; 068 indices[j - 1] = Integer.parseInt(entity[0]) - offset; 069 values[j - 1] = Float.parseFloat(entity[1]); 070 } 071 return new SparseExample(label, values, indices); 072 } 073 074 public static void main(final String[] args) { 075 final Parser<String> parser = new SVMLightParser(); 076 for (int i = 0; i < 10; i++) { 077 final List<SparseExample> examples = new ArrayList<>(); 078 float avgFtLen = 0; 079 try (final BufferedReader br = new BufferedReader(new InputStreamReader( 080 new FileInputStream("C:\\Users\\shravan\\data\\splice\\hdi\\hdi_uncomp\\part-r-0000" + i), 081 StandardCharsets.UTF_8))) { 082 String line; 083 while ((line = br.readLine()) != null) { 084 final SparseExample spEx = (SparseExample) parser.parse(line); 085 avgFtLen += spEx.getFeatureLength(); 086 examples.add(spEx); 087 } 088 } catch (final IOException e) { 089 throw new RuntimeException("Exception", e); 090 } 091 092 LOG.log(Level.INFO, "OUT: {0} {1} {2}", 093 new Object[] {examples.size(), avgFtLen, avgFtLen / examples.size()}); 094 095 examples.clear(); 096 } 097 } 098}