This project has retired. For details please refer to its Attic page.
Source code
001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *   http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing,
013 * software distributed under the License is distributed on an
014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015 * KIND, either express or implied.  See the License for the
016 * specific language governing permissions and limitations
017 * under the License.
018 */
019package org.apache.reef.examples.group.bgd.data.parser;
020
021import org.apache.commons.lang.StringUtils;
022import org.apache.reef.examples.group.bgd.data.Example;
023import org.apache.reef.examples.group.bgd.data.SparseExample;
024
025import javax.inject.Inject;
026import java.io.*;
027import java.nio.charset.StandardCharsets;
028import java.util.ArrayList;
029import java.util.List;
030import java.util.logging.Level;
031import java.util.logging.Logger;
032
033/**
034 * A Parser for SVMLight records.
035 */
036public class SVMLightParser implements Parser<String> {
037
038  private static final Logger LOG = Logger.getLogger(SVMLightParser.class.getName());
039
040  @Inject
041  public SVMLightParser() {
042  }
043
044  @Override
045  public Example parse(final String line) {
046
047    final int entriesCount = StringUtils.countMatches(line, ":");
048    final int[] indices = new int[entriesCount];
049    final float[] values = new float[entriesCount];
050
051    final String[] entries = StringUtils.split(line, ' ');
052    String labelStr = entries[0];
053
054    final boolean pipeExists = labelStr.indexOf('|') != -1;
055    if (pipeExists) {
056      labelStr = labelStr.substring(0, labelStr.indexOf('|'));
057    }
058    double label = Double.parseDouble(labelStr);
059
060    if (label != 1) {
061      label = -1;
062    }
063
064    for (int j = 1; j < entries.length; ++j) {
065      final String x = entries[j];
066      final String[] entity = StringUtils.split(x, ':');
067      final int offset = pipeExists ? 0 : 1;
068      indices[j - 1] = Integer.parseInt(entity[0]) - offset;
069      values[j - 1] = Float.parseFloat(entity[1]);
070    }
071    return new SparseExample(label, values, indices);
072  }
073
074  public static void main(final String[] args) {
075    final Parser<String> parser = new SVMLightParser();
076    for (int i = 0; i < 10; i++) {
077      final List<SparseExample> examples = new ArrayList<>();
078      float avgFtLen = 0;
079      try (final BufferedReader br = new BufferedReader(new InputStreamReader(
080              new FileInputStream("C:\\Users\\shravan\\data\\splice\\hdi\\hdi_uncomp\\part-r-0000" + i),
081              StandardCharsets.UTF_8))) {
082        String line;
083        while ((line = br.readLine()) != null) {
084          final SparseExample spEx = (SparseExample) parser.parse(line);
085          avgFtLen += spEx.getFeatureLength();
086          examples.add(spEx);
087        }
088      } catch (final IOException e) {
089        throw new RuntimeException("Exception", e);
090      }
091
092      LOG.log(Level.INFO, "OUT: {0} {1} {2}",
093          new Object[] {examples.size(), avgFtLen, avgFtLen / examples.size()});
094
095      examples.clear();
096    }
097  }
098}