Once you have subscribed to the list (a moderator may have to approve your request), you can send posts to the list using the following email address:
weka-users@lists.sourceforge.net
NB: The mailing list moved to Sourceforge.net in mid-December 2024, due to
-{"config":{"indexing":"full","lang":["en"],"min_search_length":3,"prebuild_index":false,"separator":"[\\s\\-]+"},"docs":[{"location":"","text":"New to Weka? # Have a look at the Frequently Asked Questions (FAQ), the Troubleshooting article or search the mailing list archives . Don't forget to check out the documentation and the online courses . You have questions regarding Weka? # You can post questions to the Weka mailing list . Please keep in mind that you cannot expect an immediate answer to your question(s). The questions are mainly answered by volunteers, Weka users just like you. You are looking for packages? # With Weka 3.7.2 and later, you can easily install packages through Weka's package manager interface, either official ones or unofficial ones. Have a look at the Packages article for more information on this topic. You want to contribute to the wiki? # The wiki is based on Markdown articles, which are turned into static HTML using MkDocs (see here for details on writing articles). The content of the wiki is available as repository on GitHub . Feel free to add/update and then do a pull request . You found a bug? # Please post the bug report to the Weka mailing list . The following information will help tracking things down: version of Weka (e.g., 3.9.6) operating system (e.g., Windows 11 or Ubuntu 20.04 64bit) Java version (e.g., 11.0.11+9) You can also run the following command in the SimpleCLI and attach the generated output as a text file to your post: java weka.core.SystemInfo","title":"Home"},{"location":"#new-to-weka","text":"Have a look at the Frequently Asked Questions (FAQ), the Troubleshooting article or search the mailing list archives . Don't forget to check out the documentation and the online courses .","title":"New to Weka?"},{"location":"#you-have-questions-regarding-weka","text":"You can post questions to the Weka mailing list . Please keep in mind that you cannot expect an immediate answer to your question(s). The questions are mainly answered by volunteers, Weka users just like you.","title":"You have questions regarding Weka?"},{"location":"#you-are-looking-for-packages","text":"With Weka 3.7.2 and later, you can easily install packages through Weka's package manager interface, either official ones or unofficial ones. Have a look at the Packages article for more information on this topic.","title":"You are looking for packages?"},{"location":"#you-want-to-contribute-to-the-wiki","text":"The wiki is based on Markdown articles, which are turned into static HTML using MkDocs (see here for details on writing articles). The content of the wiki is available as repository on GitHub . Feel free to add/update and then do a pull request .","title":"You want to contribute to the wiki?"},{"location":"#you-found-a-bug","text":"Please post the bug report to the Weka mailing list . The following information will help tracking things down: version of Weka (e.g., 3.9.6) operating system (e.g., Windows 11 or Ubuntu 20.04 64bit) Java version (e.g., 11.0.11+9) You can also run the following command in the SimpleCLI and attach the generated output as a text file to your post: java weka.core.SystemInfo","title":"You found a bug?"},{"location":"add_weights_to_dataset/","text":"The following examples show how to add weights to normal datasets and save them in the new XRFF data format. A version of Weka later than 3.5.3 (or the code from Git ) is necessary for this code to work. 
## Add arbitrary weights

```java
import weka.core.converters.ConverterUtils.DataSource;
import weka.core.converters.XRFFSaver;
import weka.core.Instances;
import java.io.File;

/**
 * Loads file "args[0]", sets class if necessary (in that case the last
 * attribute), adds some test weights and saves it as XRFF file
 * under "args[1]". E.g.:
 *   AddWeights anneal.arff anneal.xrff.gz
 *
 * @author FracPete (fracpete at waikato dot ac dot nz)
 */
public class AddWeights {
  public static void main(String[] args) throws Exception {
    // load data
    DataSource source = new DataSource(args[0]);
    Instances data = source.getDataSet();
    if (data.classIndex() == -1)
      data.setClassIndex(data.numAttributes() - 1);
    // set weights
    double factor = 0.5 / (double) data.numInstances();
    for (int i = 0; i < data.numInstances(); i++) {
      data.instance(i).setWeight(0.5 + factor * i);
    }
    // save data
    XRFFSaver saver = new XRFFSaver();
    saver.setFile(new File(args[1]));
    saver.setInstances(data);
    saver.writeBatch();
  }
}
```

## Add weights stored in an external file

```java
import weka.core.converters.ConverterUtils.DataSource;
import weka.core.converters.XRFFSaver;
import weka.core.Instances;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;

/**
 * Loads file "args[0]" (can be ARFF, CSV, C4.5, etc.), sets class if necessary
 * (in that case the last attribute), adds weights from "args[1]" (one weight
 * per line) and saves it as XRFF file under "args[2]". E.g.:
 *   AddWeightsFromFile anneal.arff weights.txt anneal.xrff.gz
 *
 * @author FracPete (fracpete at waikato dot ac dot nz)
 */
public class AddWeightsFromFile {
  public static void main(String[] args) throws Exception {
    // load data
    DataSource source = new DataSource(args[0]);
    Instances data = source.getDataSet();
    if (data.classIndex() == -1)
      data.setClassIndex(data.numAttributes() - 1);
    // read and set weights
    BufferedReader reader = new BufferedReader(new FileReader(args[1]));
    for (int i = 0; i < data.numInstances(); i++) {
      String line = reader.readLine();
      double weight = Double.parseDouble(line);
      data.instance(i).setWeight(weight);
    }
    reader.close();
    // save data
    XRFFSaver saver = new XRFFSaver();
    saver.setFile(new File(args[2]));
    saver.setInstances(data);
    saver.writeBatch();
  }
}
```

## Add weights stored in the attribute

```java
import weka.core.converters.ConverterUtils.DataSource;
import weka.core.converters.XRFFSaver;
import weka.core.Instances;
import java.io.File;

/**
 * Loads file "args[0]", adds the weight given in the attribute with
 * index "args[1]" - 1, deletes this attribute, sets the class if
 * necessary (in that case the last attribute) and saves it as XRFF
 * file under "args[2]". E.g.:
 *   AddWeightsFromAtt file.arff 2 file.xrff.gz
 *
 * @author FracPete (fracpete at waikato dot ac dot nz)
 * @author gabi (gs23 at waikato dot ac dot nz)
 */
public class AddWeightsFromAtt {
  public static void main(String[] args) throws Exception {
    // load data
    DataSource source = new DataSource(args[0]);
    Instances data = source.getDataSet();
    // get weight index
    int wIndex = Integer.parseInt(args[1]) - 1;
    // set weights
    for (int i = 0; i < data.numInstances(); i++) {
      double weight = data.instance(i).value(wIndex);
      data.instance(i).setWeight(weight);
    }
    // delete weight attribute and set class index
    data.deleteAttributeAt(wIndex);
    if (data.classIndex() == -1)
      data.setClassIndex(data.numAttributes() - 1);
    // save data
    XRFFSaver saver = new XRFFSaver();
    saver.setFile(new File(args[2]));
    saver.setInstances(data);
    saver.writeBatch();
  }
}
```
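All three programs write an XRFF file. To double-check that the weights actually ended up in the output, the file can be loaded back and the weights printed; a minimal sketch, assuming the output file name from the first example:

```java
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

/** Prints the instance weights stored in an XRFF file (file name assumed). */
public class PrintWeights {
  public static void main(String[] args) throws Exception {
    // DataSource picks the appropriate loader based on the file extension
    Instances data = new DataSource("anneal.xrff.gz").getDataSet();
    for (int i = 0; i < data.numInstances(); i++)
      System.out.println(i + ": " + data.instance(i).weight());
  }
}
```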
## Download

* AddWeights.java
* AddWeightsFromFile.java
* AddWeightsFromAtt.java

## See also

* git
* The unofficial Weka package dataset-weights allows you to modify attribute/instance weights using filters - no coding required.

# Adding attributes to dataset

The following example class adds a nominal and a numeric attribute to the dataset identified by the filename given as first parameter. The second parameter defines whether the data is manipulated via the Add filter (= filter) or through the Weka API directly (= java).

Usage: AddAttribute <file.arff> <filter|java>

Source code:

```java
import weka.core.*;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Add;
import java.io.*;
import java.util.*;

/**
 * Adds a nominal and a numeric attribute to the dataset provided as first
 * parameter (and fills it with random values) and outputs the result to
 * stdout. It's either done via the Add filter (first option "filter")
 * or manually in Java (second option "java").
 *
 * Usage: AddAttribute <file.arff> <filter|java>
 *
 * @author FracPete (fracpete at waikato dot ac dot nz)
 */
public class AddAttribute {

  /**
   * adds the attributes
   *
   * @param args the commandline arguments
   */
  public static void main(String[] args) throws Exception {
    if (args.length != 2) {
      System.out.println("\nUsage: AddAttribute <file.arff> <filter|java>\n");
      System.exit(1);
    }

    // load dataset
    Instances data = new Instances(new BufferedReader(new FileReader(args[0])));
    Instances newData = null;

    // filter or java?
    if (args[1].equals("filter")) {
      Add filter;
      newData = new Instances(data);
      // 1. nominal attribute
      filter = new Add();
      filter.setAttributeIndex("last");
      filter.setNominalLabels("A,B,C,D");
      filter.setAttributeName("NewNominal");
      filter.setInputFormat(newData);
      newData = Filter.useFilter(newData, filter);
      // 2. numeric attribute
      filter = new Add();
      filter.setAttributeIndex("last");
      filter.setAttributeName("NewNumeric");
      filter.setInputFormat(newData);
      newData = Filter.useFilter(newData, filter);
    }
    else if (args[1].equals("java")) {
      newData = new Instances(data);
      // add new attributes
      // 1. nominal
      // FastVector is now deprecated; any java.util.List implementation
      // can be used instead
      FastVector values = new FastVector();
      values.addElement("A");
      values.addElement("B");
      values.addElement("C");
      values.addElement("D");
      newData.insertAttributeAt(
        new Attribute("NewNominal", values), newData.numAttributes());
      // 2. numeric
      newData.insertAttributeAt(
        new Attribute("NewNumeric"), newData.numAttributes());
    }
    else {
      System.out.println("\nUsage: AddAttribute <file.arff> <filter|java>\n");
      System.exit(2);
    }

    // random values
    Random rand = new Random(1);
    for (int i = 0; i < newData.numInstances(); i++) {
      // 1. nominal
      // index of labels A:0,B:1,C:2,D:3
      newData.instance(i).setValue(newData.numAttributes() - 2, rand.nextInt(4));
      // 2. numeric
      newData.instance(i).setValue(newData.numAttributes() - 1, rand.nextDouble());
    }

    // output on stdout
    System.out.println(newData);
  }
}
```

## See also

* Creating an ARFF file - explains the creation of all the different attribute types
* Use Weka in your Java code - for general usage of the Weka API
* Save Instances to an ARFF File - if you want to save the output to a file instead of printing it to stdout (see the sketch below)
## Downloads

* AddAttribute.java (stable, developer)

# Adding tabs in the Explorer

## Description

This article explains how to add extra tabs in the Explorer in order to add new functionality without the hassle of having to dig into the Explorer code oneself. With the new plugin architecture of the Explorer it is fairly easy to make your extensions available in the GUI.

Note: This is also covered in the chapter Extending WEKA of the WEKA manual in versions later than 3.6.1/3.7.0 of the stable-3.6/developer version later than 10/01/2010.

## Version

3.5.5

## Requirements

Here is roughly what is required in order to add a new tab (the examples go into more detail):

* your class must be derived from javax.swing.JPanel
* your class must implement the interface weka.gui.explorer.Explorer.ExplorerPanel
* optional interfaces:
    * weka.gui.explorer.Explorer.LogHandler, in case you want to take advantage of the logging in the Explorer
    * weka.gui.explorer.Explorer.CapabilitiesFilterChangeListener, in case your class needs to be notified of changes in the Capabilities, e.g., if new data is loaded into the Explorer
* adding the classname of your class to the Tabs property in the Explorer.props file

## Examples

The following examples demonstrate the new plugin architecture (a bold term for such a simple extension mechanism). Only the necessary details are discussed, as the full source code is available for download as well.

### SQL worksheet

#### Purpose

Displaying the SqlViewer as a tab in the Explorer instead of using it either via the Open DB... button or as standalone application. Uses the existing components already available in Weka and just assembles them in a JPanel. Since this tab does not rely on a dataset being loaded into the Explorer, it will be used as a standalone one.

Useful for people who work a lot with databases and would like to have an SQL worksheet available all the time, instead of clicking on a button every time to open up a database dialog.

#### Implementation

The class is derived from javax.swing.JPanel and implements the weka.gui.explorer.Explorer.ExplorerPanel interface (the full source code also imports the weka.gui.explorer.Explorer.LogHandler interface, but that is only additional functionality):

```java
public class SqlPanel
  extends JPanel
  implements ExplorerPanel {
```

Some basic members that we need to have:

```java
  /** the parent frame */
  protected Explorer m_Explorer = null;

  /** sends notifications when the set of working instances gets changed */
  protected PropertyChangeSupport m_Support = new PropertyChangeSupport(this);
```

Methods we need to implement due to the used interfaces:

```java
  /** Sets the Explorer to use as parent frame */
  public void setExplorer(Explorer parent) {
    m_Explorer = parent;
  }

  /** returns the parent Explorer frame */
  public Explorer getExplorer() {
    return m_Explorer;
  }

  /** Returns the title for the tab in the Explorer */
  public String getTabTitle() {
    return "SQL";  // what's displayed as tab-title, e.g., Classify
  }

  /** Returns the tooltip for the tab in the Explorer */
  public String getTabTitleToolTip() {
    return "Retrieving data from databases";  // the tooltip of the tab
  }

  /** ignored, since we "generate" data and do not receive it */
  public void setInstances(Instances inst) {
  }

  /** PropertyChangeListener who will be notified of value changes. */
  public void addPropertyChangeListener(PropertyChangeListener l) {
    m_Support.addPropertyChangeListener(l);
  }

  /** Removes a PropertyChangeListener. */
  public void removePropertyChangeListener(PropertyChangeListener l) {
    m_Support.removePropertyChangeListener(l);
  }
```

Additional GUI elements:

```java
  /** the actual SQL worksheet */
  protected SqlViewer m_Viewer;

  /** the panel for the buttons */
  protected JPanel m_PanelButtons;

  /** the Load button - makes the data available in the Explorer */
  protected JButton m_ButtonLoad = new JButton("Load data");

  /** displays the current query */
  protected JLabel m_LabelQuery = new JLabel("");
```

Loading the data into the Explorer by clicking on the Load button will fire a property change:

```java
  m_ButtonLoad.addActionListener(new ActionListener() {
    public void actionPerformed(ActionEvent evt) {
      m_Support.firePropertyChange("", null, null);
    }
  });
```

The propertyChange event will perform the actual loading of the data, hence we add an anonymous property change listener to our panel:

```java
  addPropertyChangeListener(new PropertyChangeListener() {
    public void propertyChange(PropertyChangeEvent e) {
      try {
        // load data
        InstanceQuery query = new InstanceQuery();
        query.setDatabaseURL(m_Viewer.getURL());
        query.setUsername(m_Viewer.getUser());
        query.setPassword(m_Viewer.getPassword());
        Instances data = query.retrieveInstances(m_Viewer.getQuery());
        // set data in preprocess panel (will also notify of capabilities changes)
        getExplorer().getPreprocessPanel().setInstances(data);
      }
      catch (Exception ex) {
        ex.printStackTrace();
      }
    }
  });
```

In order to add our SqlPanel to the list of tabs displayed in the Explorer, we need to modify the Explorer.props file (just extract it from the weka.jar and place it in your home directory). The Tabs property must look like this:

    Tabs=weka.gui.explorer.SqlPanel,\
    weka.gui.explorer.ClassifierPanel,\
    weka.gui.explorer.ClustererPanel,\
    weka.gui.explorer.AssociationsPanel,\
    weka.gui.explorer.AttributeSelectionPanel,\
    weka.gui.explorer.VisualizePanel

#### Screenshot

#### Source

* SqlPanel.java (stable-3.8, developer)

### Artificial data generation

#### Purpose

Instead of only having a Generate... button in the PreprocessPanel or using it from the commandline, this example creates a new panel to be displayed as an extra tab in the Explorer. This tab will be available regardless of whether a dataset is already loaded or not (= standalone).

#### Implementation

The class is derived from javax.swing.JPanel and implements the weka.gui.explorer.Explorer.ExplorerPanel interface (the full source code also imports the weka.gui.explorer.Explorer.LogHandler interface, but that is only additional functionality):

```java
public class GeneratorPanel
  extends JPanel
  implements ExplorerPanel {
```

Some basic members that we need to have (the same as for the SqlPanel class):

```java
  /** the parent frame */
  protected Explorer m_Explorer = null;

  /** sends notifications when the set of working instances gets changed */
  protected PropertyChangeSupport m_Support = new PropertyChangeSupport(this);
```

Methods we need to implement due to the used interfaces (almost identical to SqlPanel):

```java
  /** Sets the Explorer to use as parent frame */
  public void setExplorer(Explorer parent) {
    m_Explorer = parent;
  }

  /** returns the parent Explorer frame */
  public Explorer getExplorer() {
    return m_Explorer;
  }

  /** Returns the title for the tab in the Explorer */
  public String getTabTitle() {
    return "DataGeneration";  // what's displayed as tab-title, e.g., Classify
  }

  /** Returns the tooltip for the tab in the Explorer */
  public String getTabTitleToolTip() {
    return "Generating artificial datasets";  // the tooltip of the tab
  }

  /** ignored, since we "generate" data and do not receive it */
  public void setInstances(Instances inst) {
  }

  /** PropertyChangeListener who will be notified of value changes. */
  public void addPropertyChangeListener(PropertyChangeListener l) {
    m_Support.addPropertyChangeListener(l);
  }

  /** Removes a PropertyChangeListener. */
  public void removePropertyChangeListener(PropertyChangeListener l) {
    m_Support.removePropertyChangeListener(l);
  }
```

Additional GUI elements:

```java
  /** the GOE for the generators */
  protected GenericObjectEditor m_GeneratorEditor = new GenericObjectEditor();

  /** the text area for the output of the generated data */
  protected JTextArea m_Output = new JTextArea();

  /** the Generate button */
  protected JButton m_ButtonGenerate = new JButton("Generate");

  /** the Use button */
  protected JButton m_ButtonUse = new JButton("Use");
```

The Generate button doesn't load the generated data directly into the Explorer, but only outputs it in the JTextArea (that is done with the Use button - see further down):

```java
  m_ButtonGenerate.addActionListener(new ActionListener() {
    public void actionPerformed(ActionEvent evt) {
      DataGenerator generator = (DataGenerator) m_GeneratorEditor.getValue();
      String relName = generator.getRelationName();
      String cname = generator.getClass().getName().replaceAll(".*\\.", "");
      String cmd = generator.getClass().getName();
      if (generator instanceof OptionHandler)
        cmd += " " + Utils.joinOptions(((OptionHandler) generator).getOptions());
      try {
        // generate data
        StringWriter output = new StringWriter();
        generator.setOutput(new PrintWriter(output));
        DataGenerator.makeData(generator, generator.getOptions());
        m_Output.setText(output.toString());
      }
      catch (Exception ex) {
        ex.printStackTrace();
        JOptionPane.showMessageDialog(
          getExplorer(), "Error generating data:\n" + ex.getMessage(),
          "Error", JOptionPane.ERROR_MESSAGE);
      }
      generator.setRelationName(relName);
    }
  });
```

The Use button finally fires a property change event that will load the data into the Explorer:

```java
  m_ButtonUse.addActionListener(new ActionListener() {
    public void actionPerformed(ActionEvent evt) {
      m_Support.firePropertyChange("", null, null);
    }
  });
```

The propertyChange event will perform the actual loading of the data, hence we add an anonymous property change listener to our panel:

```java
  addPropertyChangeListener(new PropertyChangeListener() {
    public void propertyChange(PropertyChangeEvent e) {
      try {
        Instances data = new Instances(new StringReader(m_Output.getText()));
        // set data in preprocess panel (will also notify of capabilities changes)
        getExplorer().getPreprocessPanel().setInstances(data);
      }
      catch (Exception ex) {
        ex.printStackTrace();
        JOptionPane.showMessageDialog(
          getExplorer(), "Error generating data:\n" + ex.getMessage(),
          "Error", JOptionPane.ERROR_MESSAGE);
      }
    }
  });
```

In order to add our GeneratorPanel to the list of tabs displayed in the Explorer, we need to modify the Explorer.props file (just extract it from the weka.jar and place it in your home directory). The Tabs property must look like this:

    Tabs=weka.gui.explorer.GeneratorPanel:standalone,\
    weka.gui.explorer.ClassifierPanel,\
    weka.gui.explorer.ClustererPanel,\
    weka.gui.explorer.AssociationsPanel,\
    weka.gui.explorer.AttributeSelectionPanel,\
    weka.gui.explorer.VisualizePanel

Note: the standalone option is used to make the tab available without requiring the preprocess panel to load a dataset first.

#### Screenshot

#### Source

* GeneratorPanel.java (stable-3.8, developer)

### Experimenter "light"

#### Purpose

By default the Classify panel only performs one run of 10-fold cross-validation. Since most classifiers are rather sensitive to the order of the data being presented to them, those results can be too optimistic or pessimistic. Averaging the results over 10 runs with differently randomized train/test pairs returns more reliable results.

And this is where this plugin comes in: it can be used to obtain statistically sound results for a specific classifier/dataset combination, without having to set up a whole experiment in the Experimenter.

#### Implementation

Since this plugin is rather bulky, we omit the implementation details, but the following can be said:

* it is based on the weka.gui.explorer.ClassifierPanel
* the actual code doing the work follows the example in the Using the Experiment API article

In order to add our ExperimentPanel to the list of tabs displayed in the Explorer, we need to modify the Explorer.props file (just extract it from the weka.jar and place it in your home directory). The Tabs property must look like this:

    Tabs=weka.gui.explorer.ClassifierPanel,\
    weka.gui.explorer.ExperimentPanel,\
    weka.gui.explorer.ClustererPanel,\
    weka.gui.explorer.AssociationsPanel,\
    weka.gui.explorer.AttributeSelectionPanel,\
    weka.gui.explorer.VisualizePanel

#### Screenshot

#### Source

* ExperimentPanel.java (stable-3.6, developer)
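To recap the requirements above in one place, a minimal, do-nothing standalone tab might look like the following sketch; the class name HelloPanel is made up, and the optional logging interface is omitted:

```java
import java.beans.PropertyChangeListener;
import java.beans.PropertyChangeSupport;
import javax.swing.JPanel;
import weka.core.Instances;
import weka.gui.explorer.Explorer;
import weka.gui.explorer.Explorer.ExplorerPanel;

/** Minimal Explorer tab: shows up in the GUI but does nothing yet. */
public class HelloPanel extends JPanel implements ExplorerPanel {
  protected Explorer m_Explorer = null;
  protected PropertyChangeSupport m_Support = new PropertyChangeSupport(this);

  public void setExplorer(Explorer parent) { m_Explorer = parent; }
  public Explorer getExplorer() { return m_Explorer; }
  public String getTabTitle() { return "Hello"; }
  public String getTabTitleToolTip() { return "A minimal example tab"; }
  public void setInstances(Instances inst) { }  // ignore incoming data
  public void addPropertyChangeListener(PropertyChangeListener l) {
    m_Support.addPropertyChangeListener(l);
  }
  public void removePropertyChangeListener(PropertyChangeListener l) {
    m_Support.removePropertyChangeListener(l);
  }
}
```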
# Ant

What is ANT? This is how the ANT homepage defines its tool: Apache Ant is a Java-based build tool. In theory, it is kind of like Make, but without Make's wrinkles.

## Basics

* the ANT build file is based on XML
* the usual name for the build file is build.xml
* invocation - the build file need not be specified explicitly if it is in the current directory; if no target is specified, the default one is used

        ant [-f <build-file>] [<target>]

* displaying all the available targets of a build file

        ant [-f <build-file>] -projecthelp

## Weka and ANT

* a build file for Weka is available from git (it has been included in the weka-src.jar since versions 3.4.8 and 3.5.3)
* it is located in the weka directory
* some targets of interest:
    * clean - Removes the build, dist and reports directories; also any class files in the source tree
    * compile - Compile weka and deposit class files in ${path_modifier}/build/classes
    * docs - Make javadocs into ${path_modifier}/doc
    * exejar - Create an executable jar file in ${path_modifier}/dist

## Links

* ANT homepage
* XML

# AUC

AUC = the Area Under the ROC Curve. Weka uses the Mann Whitney statistic to calculate the AUC via the weka.classifiers.evaluation.ThresholdCurve class.

## Explorer

See ROC curves.

## KnowledgeFlow

See ROC curves.
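The AUC is also available programmatically through the Evaluation class. A minimal sketch (the dataset path is an assumption) that cross-validates J48 and prints the ROC area for the first class label:

```java
import java.util.Random;
import weka.classifiers.Evaluation;
import weka.classifiers.trees.J48;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

/** Prints the AUC of the first class from a 10-fold cross-validation of J48. */
public class PrintAUC {
  public static void main(String[] args) throws Exception {
    Instances data = new DataSource("iris.arff").getDataSet();  // assumed path
    data.setClassIndex(data.numAttributes() - 1);
    Evaluation eval = new Evaluation(data);
    eval.crossValidateModel(new J48(), data, 10, new Random(1));
    System.out.println("AUC (first class): " + eval.areaUnderROC(0));
  }
}
```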
## Commandline

Classifiers can output the AUC if the -i option is provided; this option produces detailed information per class. Running the J48 classifier on the iris UCI dataset with the following commandline:

    java [CLASSPATH|-classpath <jars>] weka.classifiers.trees.J48 -t /some/where/iris.arff -i

produces this output:

    === Detailed Accuracy By Class ===

    TP Rate   FP Rate   Precision   Recall   F-Measure   ROC Area   Class
    0.98      0         1           0.98     0.99        0.99       Iris-setosa
    0.94      0.03      0.94        0.94     0.94        0.952      Iris-versicolor
    0.96      0.03      0.941       0.96     0.95        0.961      Iris-virginica

## See also

* ROC curves
* Mann Whitney statistic on WikiPedia

## Links

* University of Nebraska Medical Center, Interpreting Diagnostic Tests
* weka.classifiers.evaluation.ThresholdCurve

# Batch filtering

Batch filtering is used if a second dataset, normally the test set, needs to be processed with the same statistics as the first dataset, normally the training set.

For example, performing standardization with the Standardize filter on two datasets separately will most certainly create two differently standardized output files, since the mean and the standard deviation are based on the input data (and those will differ if the datasets are different). The same applies to the StringToWordVector: here the word dictionary will change, since word occurrences will differ in training and test set. The generated output will be two incompatible files.

In order to create compatible train and test sets, batch filtering is necessary. Here, the first input/output pair (-i/-o) initializes the filter's statistics and the second input/output pair (-r/-s) gets processed according to those statistics. To enable batch filtering, one has to provide the additional parameter -b on the commandline. Here is an example Java call:

    java weka.filters.unsupervised.attribute.Standardize \
      -b \
      -i train.arff \
      -o train_std.arff \
      -r test.arff \
      -s test_std.arff

Note: The commandline outlined above is for a Linux/Unix bash (the backslash tells the shell that the command isn't finished yet and continues on the next line). In case of Windows or the SimpleCLI, just remove those backslashes and put everything on one line.
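From Java code, the equivalent of the -b option is to initialize the filter on the training data only and then pass both datasets through it. A minimal sketch, assuming train.arff and test.arff exist:

```java
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Standardize;

/** Filters train and test set using statistics from the training data only. */
public class BatchFilter {
  public static void main(String[] args) throws Exception {
    Instances train = new DataSource("train.arff").getDataSet();  // assumed
    Instances test  = new DataSource("test.arff").getDataSet();   // assumed
    Standardize filter = new Standardize();
    filter.setInputFormat(train);  // initializes statistics from train only
    Instances newTrain = Filter.useFilter(train, filter);
    Instances newTest  = Filter.useFilter(test, filter);  // same statistics
  }
}
```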
## See also

* See the section Batch filtering in the article Use Weka in your Java code, in case you need to perform batch filtering from within your own code.

# Binarize attribute

Sometimes one wants to binarize a nominal attribute of a certain dataset by grouping all values except the one of interest together as a negation of this value. E.g., in the weather data the outlook attribute, where sunny is of interest and the other values, rainy and overcast, are grouped together as not-sunny.

Original dataset:

    @relation weather
    @attribute outlook {sunny, overcast, rainy}
    @attribute temperature real
    @attribute humidity real
    @attribute windy {TRUE, FALSE}
    @attribute play {yes, no}
    @data
    sunny,85,85,FALSE,no
    sunny,80,90,TRUE,no
    overcast,83,86,FALSE,yes
    rainy,70,96,FALSE,yes
    rainy,68,80,FALSE,yes
    rainy,65,70,TRUE,no
    overcast,64,65,TRUE,yes
    sunny,72,95,FALSE,no
    sunny,69,70,FALSE,yes
    rainy,75,80,FALSE,yes
    sunny,75,70,TRUE,yes
    overcast,72,90,TRUE,yes
    overcast,81,75,FALSE,yes
    rainy,71,91,TRUE,no

Desired output:

    @relation weather-sunny-and-not_sunny
    @attribute outlook {sunny,not_sunny}
    @attribute temperature numeric
    @attribute humidity numeric
    @attribute windy {TRUE,FALSE}
    @attribute play {yes,no}
    @data
    sunny,85,85,FALSE,no
    sunny,80,90,TRUE,no
    not_sunny,83,86,FALSE,yes
    not_sunny,70,96,FALSE,yes
    not_sunny,68,80,FALSE,yes
    not_sunny,65,70,TRUE,no
    not_sunny,64,65,TRUE,yes
    sunny,72,95,FALSE,no
    sunny,69,70,FALSE,yes
    not_sunny,75,80,FALSE,yes
    sunny,75,70,TRUE,yes
    not_sunny,72,90,TRUE,yes
    not_sunny,81,75,FALSE,yes
    not_sunny,71,91,TRUE,no

The Weka filter NominalToBinary cannot be used directly, since it generates a new attribute for each value of the nominal attribute. As a postprocessing step one could delete all the attributes that are of no interest, but this is quite cumbersome. The Binarize.java class, on the other hand, directly generates several ARFF files in the desired format from a given one.

## Download

* Binarize.java (stable, developer)

# Citing Weka

The best reference for WEKA 3.8 and 3.9 is the online appendix on the WEKA workbench for the fourth edition of "Data Mining: Practical Machine Learning Tools and Techniques" by I.H. Witten, Eibe Frank, Mark A. Hall, and Chris J. Pal. The citation is:

    Eibe Frank, Mark A. Hall, and Ian H. Witten (2016). The WEKA Workbench. Online Appendix for "Data Mining: Practical Machine Learning Tools and Techniques", Morgan Kaufmann, Fourth Edition, 2016.

You may also want to consider the SIGKDD Explorations paper covering WEKA 3.6. The citation is:

    Mark Hall, Eibe Frank, Geoffrey Holmes, Bernhard Pfahringer, Peter Reutemann, and Ian H. Witten (2009). The WEKA Data Mining Software: An Update. SIGKDD Explorations, Volume 11, Issue 1.

The WEKA logo is available under the Creative Commons Attribution-ShareAlike 2.5 License.

# Classifying large datasets

Unless one has access to a 64-bit machine with lots of RAM, one can quite easily run into an OutOfMemoryError when running WEKA on large datasets. This article presents some solutions apart from buying new hardware.
## Sampling

The question is, does one really need to train with all the data, or is a subset of the data already sufficient? WEKA offers several filters for re-sampling a dataset and generating a new dataset reduced in size:

* weka.filters.supervised.instance.Resample - This filter takes the class distribution into account for generating the sample, i.e., you can even adjust the distribution by adding a bias.
* weka.filters.unsupervised.instance.Resample - The unsupervised filter does not take the class distribution into account for generating the output.
* weka.filters.supervised.instance.SpreadSubsample - It allows you to specify the maximum "spread" between the rarest and most common class.

See the respective Javadoc for more information (book version, developer version).

## Incremental classifiers

Most classifiers need to see all the data before they can be trained, e.g., J48 or SMO. But there are also schemes that can be trained in an incremental fashion, not just in batch mode. All classifiers implementing the weka.classifiers.UpdateableClassifier interface are able to process data in such a way. Running such a classifier from the commandline will load the dataset incrementally (NB: not all data formats can be loaded incrementally; XRFF, for instance, cannot, whereas ARFF can be read incrementally) and feed the data instance by instance to the classifier. Check out the Javadoc of the UpdateableClassifier interface to see what schemes implement it (book version, developer version).
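As an example, an updateable classifier can be trained while reading an ARFF file incrementally, so the complete dataset never has to be held in memory. A sketch using NaiveBayesUpdateable (the file path is an assumption):

```java
import java.io.File;
import weka.classifiers.bayes.NaiveBayesUpdateable;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ArffLoader;

/** Trains NaiveBayesUpdateable instance by instance from an ARFF file. */
public class IncrementalTraining {
  public static void main(String[] args) throws Exception {
    ArffLoader loader = new ArffLoader();
    loader.setFile(new File("/some/where/data.arff"));  // assumed path
    Instances structure = loader.getStructure();
    structure.setClassIndex(structure.numAttributes() - 1);

    NaiveBayesUpdateable nb = new NaiveBayesUpdateable();
    nb.buildClassifier(structure);  // initialize with the header only
    Instance current;
    while ((current = loader.getNextInstance(structure)) != null)
      nb.updateClassifier(current);  // one instance at a time
  }
}
```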
## Other tools

* MOA - Massive Online Analysis: a framework for learning from a continuous supply of examples, a data stream. It includes tools for evaluation and a collection of machine learning algorithms. It is related to the WEKA project and also written in Java, while scaling to more demanding problems.

# Classpath

The CLASSPATH environment variable tells Java where to look for classes. Since Java does the search in a first-come-first-serve kind of manner, you'll have to take care where and what to put in your CLASSPATH. I, personally, never use the environment variable, since I often work on a project in different versions in parallel. The CLASSPATH would just mess things up, if you're not careful (or just forget to remove an entry). ANT offers a nice way of building (and separating source code and class files of) Java projects. But still, if you're only working on totally separate projects, it might be easiest for you to use the environment variable.

## Setting the CLASSPATH

In the following we add the mysql-connector-java-5.1.6-bin.jar to our CLASSPATH variable (this works for any other jar archive) to make it possible to access MySQL databases via JDBC.

### Windows

We assume that the mysql-connector-java-5.1.6-bin.jar archive is located in the following directory:

    C:\Program Files\Weka-3-8

In the Control Panel click on System (or right click on This PC and select Properties) and then go to the Advanced tab. There you will find a button called Environment Variables, click it. Depending on whether you're the only person using this computer or it is a lab computer shared by many, you can either create a new system-wide (you are the only user) environment variable or a user-dependent one (recommended for multi-user machines). Enter the following name for the variable:

    CLASSPATH

and add this value:

    C:\Program Files\Weka-3-8\mysql-connector-java-5.1.6-bin.jar

If you want to add additional jars, you'll have to separate them with the path separator, the semicolon ; (no spaces!).

### Unix/Linux

We assume that the mysql jar is located in the following directory:

    /home/johndoe/jars/

Open a shell and execute the following command, depending on the shell you're using:

* bash

        export CLASSPATH=$CLASSPATH:/home/johndoe/jars/mysql-connector-java-5.1.6-bin.jar

* c shell

        setenv CLASSPATH $CLASSPATH:/home/johndoe/jars/mysql-connector-java-5.1.6-bin.jar

Unix/Linux uses the colon : as path separator, in contrast to Windows, which uses the semicolon ;. Note: the prefixing with $CLASSPATH adds the mysql jar at the end of the currently existing CLASSPATH.
### Cygwin

The process is like with Unix/Linux systems, but since the host system is Win32 and therefore the Java installation also a Windows application, you'll have to use the semicolon ; as separator for several jars.

# Classpath problems

Having problems getting Weka to run from a DOS/UNIX command prompt? Getting java.lang.NoClassDefFoundError exceptions? Most likely your CLASSPATH environment variable is not set correctly - it needs to point to the weka.jar file that you downloaded with Weka (or the parent of the Weka directory if you have extracted the jar). Under DOS this can be achieved with:

    set CLASSPATH=c:\weka-3-4\weka.jar;%CLASSPATH%

Under UNIX/Linux something like:

    export CLASSPATH=/home/weka/weka.jar:$CLASSPATH

An easy way to avoid setting the variable is to specify the CLASSPATH when calling Java. For example, if the jar file is located at c:\weka-3-4\weka.jar you can use:

    java -cp c:\weka-3-4\weka.jar weka.classifiers...

See also the CLASSPATH article.
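When tracking down such problems, it can also help to print the classpath that the running JVM actually sees. A tiny sketch:

```java
/** Prints the classpath of the running JVM. */
public class PrintClasspath {
  public static void main(String[] args) {
    System.out.println(System.getProperty("java.class.path"));
  }
}
```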
redirecting stdout to a file someProgram >/some/where/output.txt (Linux/Unix Bash) someProgram >c:\\some\\where\\output.txt (Windows command prompt) redirecting stderr to a file someProgram 2>/some/where/output.txt (Linux/Unix Bash) someProgram 2>c:\\some\\where\\output.txt (Windows command prompt) redirecting stdout and stderr to a file someProgram &>/some/where/output.txt (Linux/Unix Bash) someProgram >c:\\some\\where\\output.txt 2>&1 (Windows command prompt) Note: under Weka quite often the output is printed to stderr , e.g., if one is using the -p 0 option from the commandline to print the predicted values for a test file: java weka.classifiers.trees.J48 -t train.arff -T test.arff -p 0 2> j48.txt or if one already has a trained model: java weka.classifiers.trees.J48 -l j48.model -T test.arff -p 0 2> j48.txt SimpleCLI # One can perform a basic redirection also in the SimpleCLI, e.g.: java weka.classifiers.trees.J48 -t test.arff > j48.txt Note: the > must be preceded and followed by a space , otherwise it is not recognized as redirection, but part of another parameter. Links # Linux Command redirection under Bash I/O Redirection under Bash Redirection under Unix (WikiPedia) Windows Command redirection under MS Windows Command redirection under MS DOS","title":"Console"},{"location":"command_redirection/#console","text":"With command redirection one can redirect standard streams like stdin , stdout and stderr to user-specified locations. Quite often it is useful to redirect the output of a program to a text file. redirecting stdout to a file someProgram >/some/where/output.txt (Linux/Unix Bash) someProgram >c:\\some\\where\\output.txt (Windows command prompt) redirecting stderr to a file someProgram 2>/some/where/output.txt (Linux/Unix Bash) someProgram 2>c:\\some\\where\\output.txt (Windows command prompt) redirecting stdout and stderr to a file someProgram &>/some/where/output.txt (Linux/Unix Bash) someProgram >c:\\some\\where\\output.txt 2>&1 (Windows command prompt) Note: under Weka quite often the output is printed to stderr , e.g., if one is using the -p 0 option from the commandline to print the predicted values for a test file: java weka.classifiers.trees.J48 -t train.arff -T test.arff -p 0 2> j48.txt or if one already has a trained model: java weka.classifiers.trees.J48 -l j48.model -T test.arff -p 0 2> j48.txt","title":"Console"},{"location":"command_redirection/#simplecli","text":"One can perform a basic redirection also in the SimpleCLI, e.g.: java weka.classifiers.trees.J48 -t test.arff > j48.txt Note: the > must be preceded and followed by a space , otherwise it is not recognized as redirection, but part of another parameter.","title":"SimpleCLI"},{"location":"command_redirection/#links","text":"Linux Command redirection under Bash I/O Redirection under Bash Redirection under Unix (WikiPedia) Windows Command redirection under MS Windows Command redirection under MS DOS","title":"Links"},{"location":"compiling_weka/","text":"There are several ways of compiling the Weka source code: with ant takes care of compiling all the necessary classes and easily generates jar archives with maven similar to ant with an IDE, like IntelliJ IDEA, Eclipse or NetBeans can be very helpful for debugging tricky bugs","title":"Compiling weka"},{"location":"cost_matrix/","text":"Format # Format of the cost matrices: regular % Rows Columns 2 2 % Matrix elements 0.0 5.0 1.0 0.0 Matlab single-line format (see also the Matlab Primer ) [0.0 5.0; 1.0 0.0] Testing the format # The following code loads a 
cost matrix and prints its content to the console. This is useful if one wants to test whether the format is correct: import weka.classifiers.CostMatrix ; import java.io.BufferedReader ; import java.io.FileReader ; /** * Loads the cost matrix \"args[0]\" and prints its content to the console. * * @author FracPete (fracpete at waikato dot ac dot nz) */ public class CostMatrixLoader { public static void main ( String [] args ) throws Exception { CostMatrix matrix = new CostMatrix ( new BufferedReader ( new FileReader ( args [ 0 ] ))); System . out . println ( matrix ); } } See also # CostSensitiveClassifier MetaCost Downloads # CostMatrixLoader.java","title":"Format"},{"location":"cost_matrix/#format","text":"Format of the cost matrices: regular % Rows Columns 2 2 % Matrix elements 0.0 5.0 1.0 0.0 Matlab single-line format (see also the Matlab Primer ) [0.0 5.0; 1.0 0.0]","title":"Format"},{"location":"cost_matrix/#testing-the-format","text":"The following code loads a cost matrix and prints its content to the console. This is useful if one wants to test whether the format is correct: import weka.classifiers.CostMatrix ; import java.io.BufferedReader ; import java.io.FileReader ; /** * Loads the cost matrix \"args[0]\" and prints its content to the console. * * @author FracPete (fracpete at waikato dot ac dot nz) */ public class CostMatrixLoader { public static void main ( String [] args ) throws Exception { CostMatrix matrix = new CostMatrix ( new BufferedReader ( new FileReader ( args [ 0 ] ))); System . out . println ( matrix ); } }","title":"Testing the format"},{"location":"cost_matrix/#see-also","text":"CostSensitiveClassifier MetaCost","title":"See also"},{"location":"cost_matrix/#downloads","text":"CostMatrixLoader.java","title":"Downloads"},{"location":"cost_sensitive_classifier/","text":"A meta classifier that makes its base classifier cost-sensitive. Two methods can be used to introduce cost-sensitivity: reweighting training instances according to the total cost assigned to each class; or predicting the class with minimum expected misclassification cost (rather than the most likely class). Performance can often be improved by using a bagged classifier to improve the probability estimates of the base classifier. Since the classifier, in default mode (i.e., when using the reweighting method), normalizes the cost matrix before applying it, it can be hard to come up with a cost matrix, e.g., one that balances out imbalanced data. Here is an example: input cost matrix -3 1 1 1 -6 1 0 0 0 normalized cost matrix 0 7 1 4 0 1 3 6 0 The application of a cost matrix using the second, minimum-expected-cost approach, which is also used by MetaCost , is more intuitive. See also # MetaCost CostMatrix","title":"Cost sensitive classifier"},{"location":"cost_sensitive_classifier/#see-also","text":"MetaCost CostMatrix","title":"See also"},{"location":"creating_instances/","text":"see Creating an ARFF file","title":"Creating instances"},{"location":"databases/","text":"CLASSPATH # See the CLASSPATH article for how to set up your CLASSPATH environment variable, in order to make the JDBC driver available for Weka. Configuration files # Thanks to JDBC it is easy to connect to databases that provide a JDBC driver. Responsible for the setup is the following properties file, located in the weka.experiment package: DatabaseUtils.props You can get this properties file from the weka.jar or weka-src.jar jar-archive, both part of a normal Weka release.
If you open up one of those files, you'll find the properties file in the sub-folder weka/experiment . Weka comes with example files for a wide range of databases: DatabaseUtils.props.hsql - HSQLDB DatabaseUtils.props.msaccess - MS Access (see the Windows Databases article for more information) DatabaseUtils.props.mssqlserver - MS SQL Server 2000 DatabaseUtils.props.mssqlserver2005 - MS SQL Server 2005 DatabaseUtils.props.mysql - MySQL DatabaseUtils.props.odbc - ODBC access via Sun's ODBC/JDBC bridge, e.g., for MS Sql Server (see the Windows Databases article for more information) DatabaseUtils.props.oracle - Oracle 10g DatabaseUtils.props.postgresql - PostgreSQL 7.4 DatabaseUtils.props.sqlite3 - sqlite 3.x The easiest way is just to place the extracted properties file into your HOME directory. For more information on how property files are processed, check out this article. Note: Weka only looks for the DatabaseUtils.props file. If you take one of the example files listed above, you need to rename it first. Setup # Under normal circumstances you only have to edit the following two properties: jdbcDriver jdbcURL Driver # jdbcDriver is the classname of the JDBC driver, necessary to connect to your database, e.g.: HSQLDB - org.hsqldb.jdbcDriver MS SQL Server 2000 (Desktop Edition) - com.microsoft.jdbc.sqlserver.SQLServerDriver MS SQL Server 2005 - com.microsoft.sqlserver.jdbc.SQLServerDriver MySQL - org.gjt.mm.mysql.Driver (or com.mysql.jdbc.Driver ) ODBC - part of Sun's JDKs/JREs, no external driver necessary - sun.jdbc.odbc.JdbcOdbcDriver Oracle - oracle.jdbc.driver.OracleDriver PostgreSQL - org.postgresql.Driver sqlite 3.x - org.sqlite.JDBC URL # jdbcURL specifies the JDBC URL pointing to your database (can still be changed in the Experimenter/Explorer), e.g. for the database MyDatabase on the server server.my.domain : HSQLDB - jdbc:hsqldb:hsql://server.my.domain/MyDatabase MS SQL Server 2000 (Desktop Edition) - jdbc:microsoft:sqlserver://server.my.domain:1433 Note: if you add ;databasename=*db-name* you can connect to a different database than the default one, e.g., MyDatabase MS SQL Server 2005 - jdbc:sqlserver://server.my.domain:1433 MySQL - jdbc:mysql://server.my.domain:3306/MyDatabase ODBC - jdbc:odbc:DSN_name (replace DSN_name with the DSN that you want to use) Oracle (thin driver) - jdbc:oracle:thin:@server.my.domain:1526:orcl Note: the format is @machineName:port:SID ; for the Express Edition you can use: jdbc:oracle:thin:@server.my.domain:1521:XE PostgreSQL - jdbc:postgresql://server.my.domain:5432/MyDatabase You can also specify user and password directly in the URL: jdbc:postgresql://server.my.domain:5432/MyDatabase?user=<...>&password=<...> where you have to replace the <...> with the correct values sqlite 3.x - jdbc:sqlite:/path/to/database.db (you can access only local files) Missing Datatypes # Sometimes (e.g. with MySQL) it can happen that a column type cannot be interpreted. In that case it is necessary to map the name of the column type to the Java type it should be interpreted as. E.g. the MySQL type TEXT is returned as BLOB from the JDBC driver and has to be mapped to String ( 0 represents String - the mappings can be found in the comments of the properties file): BLOB=0 The article weka/experiment/DatabaseUtils.props contains more details on this topic. Stored Procedures # Let's say you're tired of typing the same query over and over again. A good way to shorten that is to create a stored procedure.
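Once created (see the PostgreSQL example below), a stored procedure can be queried from your own Java code like any other SELECT statement. The following is a minimal sketch using weka.experiment.InstanceQuery; the URL, user name and password are placeholders that you need to adapt to your setup, and the DatabaseUtils.props configuration described above is assumed to be in place:
import weka.core.Instances;
import weka.experiment.InstanceQuery;

public class StoredProcedureQuery {
  public static void main(String[] args) throws Exception {
    InstanceQuery query = new InstanceQuery();
    // placeholder connection details - adapt these to your setup
    query.setDatabaseURL("jdbc:postgresql://server.my.domain:5432/MyDatabase");
    query.setUsername("johndoe");
    query.setPassword("secret");
    // a stored procedure is called like any other query
    query.setQuery("SELECT * FROM employee_name()");
    Instances data = query.retrieveInstances();
    System.out.println(data);
    query.disconnectFromDatabase();
  }
}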
PostgreSQL 7.4.x # The following example creates a procedure called employee_name that returns the names of all the employees in table employee . Even though it doesn't make much sense to create a stored procedure for this query, it nonetheless shows how to create and call stored procedures in PostgreSQL. Create CREATE OR REPLACE FUNCTION public.employee_name() RETURNS SETOF text AS 'select name from employee' LANGUAGE 'sql' VOLATILE; SQL statement to call procedure SELECT * FROM employee_name() Retrieve data via InstanceQuery java weka.experiment.InstanceQuery -Q \"SELECT * FROM employee_name()\" -U -P Troubleshooting # In case you're experiencing problems connecting to your database, check out the mailing list . It is possible that somebody else encountered the same problem as you and you'll find a post containing the solution to your problem. Specific MS SQL Server 2000 Troubleshooting MS SQL Server 2005: TCP/IP is not enabled for SQL Server, or the server or port number specified is incorrect. Verify that SQL Server is listening with TCP/IP on the specified server and port. This might be reported with an exception similar to: \"The login has failed. The TCP/IP connection to the host has failed.\" This indicates one of the following: SQL Server is installed but TCP/IP has not been installed as a network protocol for SQL Server by using the SQL Server Network Utility for SQL Server 2000, or the SQL Server Configuration Manager for SQL Server 2005 TCP/IP is installed as a SQL Server protocol, but it is not listening on the port specified in the JDBC connection URL. The default port is 1433. The port that is used by the server has not been opened in the firewall The Added driver: ... output on the commandline does not mean that the actual class was found, but only that Weka will attempt to load the class later on in order to establish a database connection. The error message No suitable driver can be caused by the following: The JDBC driver you are attempting to load is not in the CLASSPATH (Note: using -jar in the java commandline overwrites the CLASSPATH environment variable!). Open the SimpleCLI, run the command java weka.core.SystemInfo and check whether the property java.class.path lists your database jar. If not, correct your CLASSPATH or the Java call you start Weka with. The JDBC driver class is misspelled in the jdbcDriver property or you have multiple entries of jdbcDriver ( properties files need unique keys!) The jdbcURL property has a spelling error and tries to use a non-existing protocol or you listed it multiple times, which doesn't work either (remember, properties files need unique keys!) See also # weka/experiment/DatabaseUtils.props properties file CLASSPATH Links # HSQLDB homepage IBM Cloudscape homepage Microsoft SQL Server SQL Server 2000 (Desktop Engine) SQL Server 2000 JDBC Driver SP 3 SQL Server 2005 JDBC Driver MySQL homepage JDBC driver Oracle homepage JDBC driver JDBC FAQ PostgreSQL homepage JDBC driver sqlite homepage JDBC driver Weka Mailing list","title":"CLASSPATH"},{"location":"databases/#classpath","text":"See the CLASSPATH article for how to set up your CLASSPATH environment variable, in order to make the JDBC driver available for Weka.","title":"CLASSPATH"},{"location":"databases/#configuration-files","text":"Thanks to JDBC it is easy to connect to databases that provide a JDBC driver.
Responsible for the setup is the following properties file, located in the weka.experiment package: DatabaseUtils.props You can get this properties file from the weka.jar or weka-src.jar jar-archive, both part of a normal Weka release. If you open up one of those files, you'll find the properties file in the sub-folder weka/experiment . Weka comes with example files for a wide range of databases: DatabaseUtils.props.hsql - HSQLDB DatabaseUtils.props.msaccess - MS Access (see the Windows Databases article for more information) DatabaseUtils.props.mssqlserver - MS SQL Server 2000 DatabaseUtils.props.mssqlserver2005 - MS SQL Server 2005 DatabaseUtils.props.mysql - MySQL DatabaseUtils.props.odbc - ODBC access via Sun's ODBC/JDBC bridge, e.g., for MS Sql Server (see the Windows Databases article for more information) DatabaseUtils.props.oracle - Oracle 10g DatabaseUtils.props.postgresql - PostgreSQL 7.4 DatabaseUtils.props.sqlite3 - sqlite 3.x The easiest way is just to place the extracted properties file into your HOME directory. For more information on how property files are processed, check out this article. Note: Weka only looks for the DatabaseUtils.props file. If you take one of the example files listed above, you need to rename it first.","title":"Configuration files"},{"location":"databases/#setup","text":"Under normal circumstances you only have to edit the following two properties: jdbcDriver jdbcURL","title":"Setup"},{"location":"databases/#driver","text":"jdbcDriver is the classname of the JDBC driver, necessary to connect to your database, e.g.: HSQLDB - org.hsqldb.jdbcDriver MS SQL Server 2000 (Desktop Edition) - com.microsoft.jdbc.sqlserver.SQLServerDriver MS SQL Server 2005 - com.microsoft.sqlserver.jdbc.SQLServerDriver MySQL - org.gjt.mm.mysql.Driver (or com.mysql.jdbc.Driver ) ODBC - part of Sun's JDKs/JREs, no external driver necessary - sun.jdbc.odbc.JdbcOdbcDriver Oracle - oracle.jdbc.driver.OracleDriver PostgreSQL - org.postgresql.Driver sqlite 3.x - org.sqlite.JDBC","title":"Driver"},{"location":"databases/#url","text":"jdbcURL specifies the JDBC URL pointing to your database (can still be changed in the Experimenter/Explorer), e.g. for the database MyDatabase on the server server.my.domain : HSQLDB - jdbc:hsqldb:hsql://server.my.domain/MyDatabase MS SQL Server 2000 (Desktop Edition) - jdbc:microsoft:sqlserver://server.my.domain:1433 Note: if you add ;databasename=*db-name* you can connect to a different database than the default one, e.g., MyDatabase MS SQL Server 2005 - jdbc:sqlserver://server.my.domain:1433 MySQL - jdbc:mysql://server.my.domain:3306/MyDatabase ODBC - jdbc:odbc:DSN_name (replace DSN_name with the DSN that you want to use) Oracle (thin driver) - jdbc:oracle:thin:@server.my.domain:1526:orcl Note: the format is @machineName:port:SID ; for the Express Edition you can use: jdbc:oracle:thin:@server.my.domain:1521:XE PostgreSQL - jdbc:postgresql://server.my.domain:5432/MyDatabase You can also specify user and password directly in the URL: jdbc:postgresql://server.my.domain:5432/MyDatabase?user=<...>&password=<...> where you have to replace the <...> with the correct values sqlite 3.x - jdbc:sqlite:/path/to/database.db (you can access only local files)","title":"URL"},{"location":"databases/#missing-datatypes","text":"Sometimes (e.g. with MySQL) it can happen that a column type cannot be interpreted. In that case it is necessary to map the name of the column type to the Java type it should be interpreted as. E.g.
the MySQL type TEXT is returned as BLOB from the JDBC driver and has to be mapped to String ( 0 represents String - the mappings can be found in the comments of the properties file): BLOB=0 The article weka/experiment/DatabaseUtils.props contains more details on this topic.","title":"Missing Datatypes"},{"location":"databases/#stored-procedures","text":"Let's say you're tired of typing the same query over and over again. A good way to shorten that is to create a stored procedure.","title":"Stored Procedures"},{"location":"databases/#postgresql-74x","text":"The following example creates a procedure called employee_name that returns the names of all the employees in table employee . Even though it doesn't make much sense to create a stored procedure for this query, it nonetheless shows how to create and call stored procedures in PostgreSQL. Create CREATE OR REPLACE FUNCTION public.employee_name() RETURNS SETOF text AS 'select name from employee' LANGUAGE 'sql' VOLATILE; SQL statement to call procedure SELECT * FROM employee_name() Retrieve data via InstanceQuery java weka.experiment.InstanceQuery -Q \"SELECT * FROM employee_name()\" -U -P ","title":"PostgreSQL 7.4.x"},{"location":"databases/#troubleshooting","text":"In case you're experiencing problems connecting to your database, check out the mailing list . It is possible that somebody else encountered the same problem as you and you'll find a post containing the solution to your problem. Specific MS SQL Server 2000 Troubleshooting MS SQL Server 2005: TCP/IP is not enabled for SQL Server, or the server or port number specified is incorrect. Verify that SQL Server is listening with TCP/IP on the specified server and port. This might be reported with an exception similar to: \"The login has failed. The TCP/IP connection to the host has failed.\" This indicates one of the following: SQL Server is installed but TCP/IP has not been installed as a network protocol for SQL Server by using the SQL Server Network Utility for SQL Server 2000, or the SQL Server Configuration Manager for SQL Server 2005 TCP/IP is installed as a SQL Server protocol, but it is not listening on the port specified in the JDBC connection URL. The default port is 1433. The port that is used by the server has not been opened in the firewall The Added driver: ... output on the commandline does not mean that the actual class was found, but only that Weka will attempt to load the class later on in order to establish a database connection. The error message No suitable driver can be caused by the following: The JDBC driver you are attempting to load is not in the CLASSPATH (Note: using -jar in the java commandline overwrites the CLASSPATH environment variable!). Open the SimpleCLI, run the command java weka.core.SystemInfo and check whether the property java.class.path lists your database jar. If not, correct your CLASSPATH or the Java call you start Weka with. The JDBC driver class is misspelled in the jdbcDriver property or you have multiple entries of jdbcDriver ( properties files need unique keys!)
The jdbcURL property has a spelling error and tries to use a non-existing protocol or you listed it multiple times, which doesn't work either (remember, properties files need unique keys!)","title":"Troubleshooting"},{"location":"databases/#see-also","text":"weka/experiment/DatabaseUtils.props properties file CLASSPATH","title":"See also"},{"location":"databases/#links","text":"HSQLDB homepage IBM Cloudscape homepage Microsoft SQL Server SQL Server 2000 (Desktop Engine) SQL Server 2000 JDBC Driver SP 3 SQL Server 2005 JDBC Driver MySQL homepage JDBC driver Oracle homepage JDBC driver JDBC FAQ PostgreSQL homepage JDBC driver sqlite homepage JDBC driver Weka Mailing list","title":"Links"},{"location":"datasets/","text":"Some example datasets for analysis with Weka are included in the Weka distribution and can be found in the data folder of the installed software. Miscellaneous collections of datasets # A jarfile containing 37 classification problems originally obtained from the UCI repository of machine learning datasets ( datasets-UCI.jar , 1,190,961 Bytes). A jarfile containing 37 regression problems obtained from various sources ( datasets-numeric.jar , 169,344 Bytes). A jarfile containing 6 agricultural datasets obtained from agricultural researchers in New Zealand ( agridatasets.jar , 31,200 Bytes). A jarfile containing 30 regression datasets collected by Professor Luis Torgo ( regression-datasets.jar , 10,090,266 Bytes). A gzip'ed tar containing UCI ML and UCI KDD datasets ( uci-20070111.tar.gz , 17,952,832 Bytes) A gzip'ed tar containing StatLib datasets ( statlib-20050214.tar.gz , 12,785,582 Bytes) A gzip'ed tar containing ordinal, real-world datasets donated by Professor Arie Ben David ( datasets-arie_ben_david.tar.gz , 11,348 Bytes) A zip file containing 19 multi-class (1-of-n) text datasets donated by Dr George Forman ( 19MclassTextWc.zip , 14,084,828 Bytes) A bzip'ed tar file containing the Reuters21578 dataset split into separate files according to the ModApte split ( reuters21578-ModApte.tar.bz2 , 81,745,032 Bytes) A zip file containing 41 drug design datasets formed using the Adriana.Code software donated by Dr Mehmet Fatih Amasyali ( Drug-datasets.zip , 11,376,153 Bytes) A zip file containing 80 artificial datasets generated from the Friedman function donated by Dr. M. Fatih Amasyali (Yildiz Technical University) ( Friedman-datasets.zip , 5,802,204 Bytes) A zip file containing a new, image-based version of the classic iris data, with 50 images for each of the three species of iris. The images have size 600x600. Please see the ARFF file for further information ( iris_reloaded.zip , 92,267,000 Bytes). After expanding into a directory using your jar utility (or an archive program that handles tar-archives/zip files in case of the gzip'ed tars/zip files), these datasets may be used with Weka. Bioinformatics datasets # Some bioinformatics datasets in Weka's ARFF format. These are quite old but still available thanks to the Internet Archive. Protein datasets made available by Associate Professor Shuiwang Ji when he was a PhD student at Louisiana State University . Kent Ridge Biomedical Data Set Repository , which was put together by Professor Jinyan Li and Dr Huiqing Liu while they were at the Institute for Infocomm Research, Singapore .
Repository for Epitope Datasets (RED) , maintained by Professor Yasser El-Manzalawy when he was at Iowa State University .","title":"Datasets"},{"location":"datasets/#miscellaneous-collections-of-datasets","text":"A jarfile containing 37 classification problems originally obtained from the UCI repository of machine learning datasets ( datasets-UCI.jar , 1,190,961 Bytes). A jarfile containing 37 regression problems obtained from various sources ( datasets-numeric.jar , 169,344 Bytes). A jarfile containing 6 agricultural datasets obtained from agricultural researchers in New Zealand ( agridatasets.jar , 31,200 Bytes). A jarfile containing 30 regression datasets collected by Professor Luis Torgo ( regression-datasets.jar , 10,090,266 Bytes). A gzip'ed tar containing UCI ML and UCI KDD datasets ( uci-20070111.tar.gz , 17,952,832 Bytes) A gzip'ed tar containing StatLib datasets ( statlib-20050214.tar.gz , 12,785,582 Bytes) A gzip'ed tar containing ordinal, real-world datasets donated by Professor Arie Ben David ( datasets-arie_ben_david.tar.gz , 11,348 Bytes) A zip file containing 19 multi-class (1-of-n) text datasets donated by Dr George Forman ( 19MclassTextWc.zip , 14,084,828 Bytes) A bzip'ed tar file containing the Reuters21578 dataset split into separate files according to the ModApte split ( reuters21578-ModApte.tar.bz2 , 81,745,032 Bytes) A zip file containing 41 drug design datasets formed using the Adriana.Code software donated by Dr Mehmet Fatih Amasyali ( Drug-datasets.zip , 11,376,153 Bytes) A zip file containing 80 artificial datasets generated from the Friedman function donated by Dr. M. Fatih Amasyali (Yildiz Technical University) ( Friedman-datasets.zip , 5,802,204 Bytes) A zip file containing a new, image-based version of the classic iris data, with 50 images for each of the three species of iris. The images have size 600x600. Please see the ARFF file for further information ( iris_reloaded.zip , 92,267,000 Bytes). After expanding into a directory using your jar utility (or an archive program that handles tar-archives/zip files in case of the gzip'ed tars/zip files), these datasets may be used with Weka.","title":"Miscellaneous collections of datasets"},{"location":"datasets/#bioinformatics-datasets","text":"Some bioinformatics datasets in Weka's ARFF format. These are quite old but still available thanks to the Internet Archive. Protein datasets made available by Associate Professor Shuiwang Ji when he was a PhD student at Louisiana State University . Kent Ridge Biomedical Data Set Repository , which was put together by Professor Jinyan Li and Dr Huiqing Liu while they were at the Institute for Infocomm Research, Singapore . Repository for Epitope Datasets (RED) , maintained by Professor Yasser El-Manzalawy when he was at Iowa State University .","title":"Bioinformatics datasets"},{"location":"development/","text":"We are following the Linux model of releases, where an even second digit of a release number indicates a \"stable\" release and an odd second digit indicates a \"development\" release (e.g., 3.0.x is a stable release and 3.1.x is a developmental release). If you are using a developmental release, there may be new features, but it is entirely possible that these features will be transient and/or unstable, and backward compatibility of the API and/or models is not guaranteed. If you require stability for teaching or deployment in applications, it is best to use a stable release of Weka.
Source code repository # Weka's source code for a particular release is included in the distribution when you download it, in a .jar file (a form of .zip file) called weka-src.jar . However, it is also possible to read source code directly from the git source code repository for Weka. Code credits # The Weka developers would like to thank The MathWorks and the National Institute of Standards and Technology (NIST) for developing the Jama Matrix package and releasing it to the public domain, and to CERN (European Organization for Nuclear Research) for statistics-related code from their Jet libraries (now part of COLT ). The core Weka distributions include third-party library code from the MTJ project for fast matrix algebra in Java, the Java CUP project for generating parsers, the authentication dialog from the Bounce project , and the Apache Commons Compress library. For more information, see the lib folder of the source code repository. Weka, including the early non-Java predecessors of Weka 3, was developed at the Department of Computer Science of the University of Waikato in Hamilton , New Zealand . Most of Weka 3 was written by Eibe Frank, Mark Hall, Peter Reutemann, and Len Trigg, but many others have made significant contributions, in particular, Remco Bouckaert, Richard Kirkby, Ashraf Kibriya, Xin Xu, and Malcolm Ware. For complete info on the contributors, check the Javadoc extracted from the source code of Weka, which is part of the available documentation . Weka's package manager provides access to a large collection of optional libraries, many of which have been contributed by developers from other institutions. For information on the authors of these packages and the third-party libraries used within those Weka packages, please consult the Javadoc for the relevant package and the corresponding package lib folder.","title":"Development"},{"location":"development/#source-code-repository","text":"Weka's source code for a particular release is included in the distribution when you download it, in a .jar file (a form of .zip file) called weka-src.jar . However, it is also possible to read source code directly from the git source code repository for Weka.","title":"Source code repository"},{"location":"development/#code-credits","text":"The Weka developers would like to thank The MathWorks and the National Institute of Standards and Technology (NIST) for developing the Jama Matrix package and releasing it to the public domain, and to CERN (European Organization for Nuclear Research) for statistics-related code from their Jet libraries (now part of COLT ). The core Weka distributions include third-party library code from the MTJ project for fast matrix algebra in Java, the Java CUP project for generating parsers, the authentication dialog from the Bounce project , and the Apache Commons Compress library. For more information, see the lib folder of the source code repository. Weka, including the early non-Java predecessors of Weka 3, was developed at the Department of Computer Science of the University of Waikato in Hamilton , New Zealand . Most of Weka 3 was written by Eibe Frank, Mark Hall, Peter Reutemann, and Len Trigg, but many others have made significant contributions, in particular, Remco Bouckaert, Richard Kirkby, Ashraf Kibriya, Xin Xu, and Malcolm Ware. For complete info on the contributors, check the Javadoc extracted from the source code of Weka, which is part of the available documentation . 
Weka's package manager provides access to a large collection of optional libraries, many of which have been contributed by developers from other institutions. For information on the authors of these packages and the third-party libraries used within those Weka packages, please consult the Javadoc for the relevant package and the corresponding package lib folder.","title":"Code credits"},{"location":"discretizing_datasets/","text":"Once in a while one has numeric data but wants to use a classifier that handles only nominal values. In that case one needs to discretize the data, which can be done with the following filters: weka.filters.supervised.attribute.Discretize uses either Fayyad & Irani's MDL method or Kononenko's MDL criterion weka.filters.unsupervised.attribute.Discretize uses simple binning But since discretization depends on the data that is presented to the discretization algorithm, one can easily end up with incompatible train and test files. The following shows how to generate compatible discretized files out of a training and a test file by using the supervised version of the filter. The class takes four files as arguments: input training file input test file output training file output test file import java.io.* ; import weka.core.* ; import weka.filters.Filter ; import weka.filters.supervised.attribute.Discretize ; /** * Shows how to generate compatible train/test sets using the Discretize * filter. * * @author FracPete (fracpete at waikato dot ac dot nz) */ public class DiscretizeTest { /** * loads the given ARFF file and sets the class attribute as the last * attribute. * * @param filename the file to load * @throws Exception if something goes wrong */ protected static Instances load ( String filename ) throws Exception { Instances result ; BufferedReader reader ; reader = new BufferedReader ( new FileReader ( filename )); result = new Instances ( reader ); result . setClassIndex ( result . numAttributes () - 1 ); reader . close (); return result ; } /** * saves the data to the specified file * * @param data the data to save to a file * @param filename the file to save the data to * @throws Exception if something goes wrong */ protected static void save ( Instances data , String filename ) throws Exception { BufferedWriter writer ; writer = new BufferedWriter ( new FileWriter ( filename )); writer . write ( data . toString ()); writer . newLine (); writer . flush (); writer . close (); } /** * Takes four arguments:
* input train file * input test file * output train file * output test file
* * @param args the commandline arguments * @throws Exception if something goes wrong */ public static void main ( String [] args ) throws Exception { Instances inputTrain ; Instances inputTest ; Instances outputTrain ; Instances outputTest ; Discretize filter ; // load data (class attribute is assumed to be the last attribute) inputTrain = load ( args [ 0 ] ); inputTest = load ( args [ 1 ] ); // setup filter filter = new Discretize (); filter . setInputFormat ( inputTrain ); // apply filter outputTrain = Filter . useFilter ( inputTrain , filter ); outputTest = Filter . useFilter ( inputTest , filter ); // save output save ( outputTrain , args [ 2 ] ); save ( outputTest , args [ 3 ] ); } } The same can be achieved from the commandline with this command ( batch filtering ): java weka.filters.supervised.attribute.Discretize -b -i <in-train> -o <out-train> -r <in-test> -s <out-test> -c <class-index> See also # Manual discretization (Using the MathExpression filter) Batch filtering Downloads # DiscretizeTest.java Links # Javadoc Discretize (supervised) Discretize (unsupervised)","title":"Discretizing datasets"},{"location":"discretizing_datasets/#see-also","text":"Manual discretization (Using the MathExpression filter) Batch filtering","title":"See also"},{"location":"discretizing_datasets/#downloads","text":"DiscretizeTest.java","title":"Downloads"},{"location":"discretizing_datasets/#links","text":"Javadoc Discretize (supervised) Discretize (unsupervised)","title":"Links"},{"location":"document_classification/","text":"See Text categorization with Weka","title":"Document classification"},{"location":"documentation/","text":"This wiki is not the only source of information on the Weka software. Weka comes with built-in help and includes a comprehensive manual. For an introduction to the machine learning techniques implemented in Weka, and the software itself, consider taking a look at the book Data Mining: Practical Machine Learning Tools and Techniques and its freely available online appendix on the Weka workbench , providing an overview of the software. Closely linked to the book, there are also free online courses on data mining with the machine learning techniques in Weka. A list of sources with information on Weka is provided below. General documentation # The online appendix The Weka Workbench , distributed as a free PDF, for the fourth edition of the book Data Mining: Practical Machine Learning Tools and Techniques . The manual for Weka 3.8 and the manual for Weka 3.9 , as included in the distribution of the software when you download it. The Javadoc for Weka 3.8 and the Javadoc for Weka 3.9 , extracted directly from the source code, providing information on the API and parameters for command-line usage of Weka. The videos and slides for the online courses on Data Mining with Weka , More Data Mining with Weka , and Advanced Data Mining with Weka . Weka packages # There is a list of packages for Weka that can be installed using the built-in package manager. Javadoc for a package is available at https://weka.sourceforge.io/doc.packages/ followed by the name of the package. Mailing list archive # The Weka mailing list is a very helpful source of information, spanning more than 15 years of questions and answers on Weka. Blogs # There is the official Weka blog that has Weka-related news items and the occasional article of interest to Weka users. There is also Mark Hall's blog with a lot of useful information on several important Weka packages in particular.
Other sources of information # Weka can be used from several other software systems for data science, and there is a set of slides on WEKA in the Ecosystem for Scientific Computing covering Octave/Matlab, R, Python, and Hadoop. A page with news and documentation on Weka's support for importing PMML models . A short tutorial on connecting Weka to MongoDB using a JDBC driver .","title":"Documentation"},{"location":"documentation/#general-documentation","text":"The online appendix The Weka Workbench , distributed as a free PDF, for the fourth edition of the book Data Mining: Practical Machine Learning Tools and Techniques . The manual for Weka 3.8 and the manual for Weka 3.9 , as included in the distribution of the software when you download it. The Javadoc for Weka 3.8 and the Javadoc for Weka 3.9 , extracted directly from the source code, providing information on the API and parameters for command-line usage of Weka. The videos and slides for the online courses on Data Mining with Weka , More Data Mining with Weka , and Advanced Data Mining with Weka .","title":"General documentation"},{"location":"documentation/#weka-packages","text":"There is a list of packages for Weka that can be installed using the built-in package manager. Javadoc for a package is available at https://weka.sourceforge.io/doc.packages/ followed by the name of the package.","title":"Weka packages"},{"location":"documentation/#mailing-list-archive","text":"The Weka mailing list is a very helpful source of information, spanning more than 15 years of questions and answers on Weka.","title":"Mailing list archive"},{"location":"documentation/#blogs","text":"There is the official Weka blog that has Weka-related news items and the occasional article of interest to Weka users. There is also Mark Hall's blog with a lot of useful information on several important Weka packages in particular.","title":"Blogs"},{"location":"documentation/#other-sources-of-information","text":"Weka can be used from several other software systems for data science, and there is a set of slides on WEKA in the Ecosystem for Scientific Computing covering Octave/Matlab, R, Python, and Hadoop. A page with news and documentation on Weka's support for importing PMML models . A short tutorial on connecting Weka to MongoDB using a JDBC driver .","title":"Other sources of information"},{"location":"downloading_weka/","text":"There are two versions of Weka: Weka 3.8 is the latest stable version and Weka 3.9 is the development version. New releases of these two versions are normally made once or twice a year. The stable version receives only bug fixes and feature upgrades that do not break compatibility with its earlier releases, while the development version may receive new features that break compatibility with its earlier releases. Weka 3.8 and 3.9 feature a package management system that makes it easy for the Weka community to add new functionality to Weka. The package management system requires an internet connection in order to download and install packages. Stable version # Weka 3.8 is the latest stable version of Weka. This branch of Weka only receives bug fixes and upgrades that do not break compatibility with earlier 3.8 releases, although major new features may become available in packages.
There are different options for downloading and installing it on your system: Windows # Click here to download a self-extracting executable for 64-bit Windows that includes Azul's 64-bit OpenJDK Java VM 17 (weka-3-8-6-azul-zulu-windows.exe; 133.2 MB) This executable will install Weka in your Program Menu. Launching via the Program Menu or shortcuts will automatically use the included JVM to run Weka. Mac OS - Intel processors # Click here to download a disk image for Mac OS that contains a Mac application including Azul's 64-bit OpenJDK Java VM 17 for Intel Macs. (weka-3-8-6-azul-zulu-osx.dmg; 180.2 MB) Mac OS - ARM processors # Click here to download a disk image for Mac OS that contains a Mac application including Azul's 64-bit OpenJDK Java VM 17 for ARM Macs. (weka-3-8-6-azul-zulu-arm-osx.dmg; 166.3 MB) Linux # Click here to download a zip archive for Linux that includes Azul's 64-bit OpenJDK Java VM 17 (weka-3-8-6-azul-zulu-linux.zip; 146.9 MB) First unzip the zip file. This will create a new directory called weka-3-8-6. To run Weka, change into that directory and type ./weka.sh Other platforms # Click here to download a zip archive containing Weka (weka-3-8-6.zip; 59.6 MB) First unzip the zip file. This will create a new directory called weka-3-8-6. To run Weka, change into that directory and type java -jar weka.jar Note that Java needs to be installed on your system for this to work. Also note that using -jar will override your current CLASSPATH variable and only use the weka.jar . Developer version # This is the main development trunk of Weka and continues from the stable Weka 3.8 code line. It may receive new features that break backwards compatibility. Windows # Click here to download a self-extracting executable for 64-bit Windows that includes Azul's 64-bit OpenJDK Java VM 17 (weka-3-9-6-azul-zulu-windows.exe; 133.0 MB) This executable will install Weka in your Program Menu. Launching via the Program Menu or shortcuts will automatically use the included JVM to run Weka. Mac OS - Intel processors # Click here to download a disk image for Mac OS that contains a Mac application including Azul's 64-bit OpenJDK Java VM 17 for Intel Macs. (weka-3-9-6-azul-zulu-osx.dmg; 180.0 MB) Mac OS - ARM processors # Click here to download a disk image for Mac OS that contains a Mac application including Azul's 64-bit OpenJDK Java VM 17 for ARM Macs. (weka-3-9-6-azul-zulu-arm-osx.dmg; 166.3 MB) Linux # Click here to download a zip archive for Linux that includes Azul's 64-bit OpenJDK Java VM 17 (weka-3-9-6-azul-zulu-linux.zip; 146.7 MB) First unzip the zip file. This will create a new directory called weka-3-9-6. To run Weka, change into that directory and type ./weka.sh Other platforms # Click here to download a zip archive containing Weka (weka-3-9-6.zip; 59.4 MB) First unzip the zip file. This will create a new directory called weka-3-9-6. To run Weka, change into that directory and type java -jar weka.jar Note that Java needs to be installed on your system for this to work. Also note that using -jar will override your current CLASSPATH variable and only use the weka.jar . Old versions # All old versions of Weka are available from the Sourceforge website . Upgrading from Weka 3.7 # In case you are upgrading an existing Weka 3.7 installation, if the Weka 3.8 package manager does not start up, please delete the file installedPackageCache.ser in the packages folder that resides in the wekafiles folder in your user home. Also, serialized Weka models created in 3.7 are incompatible with 3.8.
The model migrator tool can migrate some models to 3.8 (a known exception is RandomForest). Usage is as follows: java -cp : weka.core.ModelMigrator -i -o ","title":"Downloading and installing Weka"},{"location":"downloading_weka/#stable-version","text":"Weka 3.8 is the latest stable version of Weka. This branch of Weka only receives bug fixes and upgrades that do not break compatibility with earlier 3.8 releases, although major new features may become available in packages. There are different options for downloading and installing it on your system:","title":"Stable version"},{"location":"downloading_weka/#windows","text":"Click here to download a self-extracting executable for 64-bit Windows that includes Azul's 64-bit OpenJDK Java VM 17 (weka-3-8-6-azul-zulu-windows.exe; 133.2 MB) This executable will install Weka in your Program Menu. Launching via the Program Menu or shortcuts will automatically use the included JVM to run Weka.","title":"Windows"},{"location":"downloading_weka/#mac-os-intel-processors","text":"Click here to download a disk image for Mac OS that contains a Mac application including Azul's 64-bit OpenJDK Java VM 17 for Intel Macs. (weka-3-8-6-azul-zulu-osx.dmg; 180.2 MB)","title":"Mac OS - Intel processors"},{"location":"downloading_weka/#mac-os-arm-processors","text":"Click here to download a disk image for Mac OS that contains a Mac application including Azul's 64-bit OpenJDK Java VM 17 for ARM Macs. (weka-3-8-6-azul-zulu-arm-osx.dmg; 166.3 MB)","title":"Mac OS - ARM processors"},{"location":"downloading_weka/#linux","text":"Click here to download a zip archive for Linux that includes Azul's 64-bit OpenJDK Java VM 17 (weka-3-8-6-azul-zulu-linux.zip; 146.9 MB) First unzip the zip file. This will create a new directory called weka-3-8-6. To run Weka, change into that directory and type ./weka.sh","title":"Linux"},{"location":"downloading_weka/#other-platforms","text":"Click here to download a zip archive containing Weka (weka-3-8-6.zip; 59.6 MB) First unzip the zip file. This will create a new directory called weka-3-8-6. To run Weka, change into that directory and type java -jar weka.jar Note that Java needs to be installed on your system for this to work. Also note that using -jar will override your current CLASSPATH variable and only use the weka.jar .","title":"Other platforms"},{"location":"downloading_weka/#developer-version","text":"This is the main development trunk of Weka and continues from the stable Weka 3.8 code line. It may receive new features that break backwards compatibility.","title":"Developer version"},{"location":"downloading_weka/#windows_1","text":"Click here to download a self-extracting executable for 64-bit Windows that includes Azul's 64-bit OpenJDK Java VM 17 (weka-3-9-6-azul-zulu-windows.exe; 133.0 MB) This executable will install Weka in your Program Menu. Launching via the Program Menu or shortcuts will automatically use the included JVM to run Weka.","title":"Windows"},{"location":"downloading_weka/#mac-os-intel-processors_1","text":"Click here to download a disk image for Mac OS that contains a Mac application including Azul's 64-bit OpenJDK Java VM 17 for Intel Macs. (weka-3-9-6-azul-zulu-osx.dmg; 180.0 MB)","title":"Mac OS - Intel processors"},{"location":"downloading_weka/#mac-os-arm-processors_1","text":"Click here to download a disk image for Mac OS that contains a Mac application including Azul's 64-bit OpenJDK Java VM 17 for ARM Macs.
(weka-3-9-6-azul-zulu-arm-osx.dmg; 166.3 MB)","title":"Mac OS - ARM processors"},{"location":"downloading_weka/#linux_1","text":"Click here to download a zip archive for Linux that includes Azul's 64-bit OpenJDK Java VM 17 (weka-3-9-6-azul-zulu-linux.zip; 146.7 MB) First unzip the zip file. This will create a new directory called weka-3-9-6. To run Weka, change into that directory and type ./weka.sh","title":"Linux"},{"location":"downloading_weka/#other-platforms_1","text":"Click here to download a zip archive containing Weka (weka-3-9-6.zip; 59.4 MB) First unzip the zip file. This will create a new directory called weka-3-9-6. To run Weka, change into that directory and type java -jar weka.jar Note that Java needs to be installed on your system for this to work. Also note, that using -jar will override your current CLASSPATH variable and only use the weka.jar .","title":"Other platforms"},{"location":"downloading_weka/#old-versions","text":"All old versions of Weka are available from the Sourceforge website .","title":"Old versions"},{"location":"downloading_weka/#upgrading-from-weka-37","text":"In case you are upgrading an existing Weka 3.7 installation, if the Weka 3.8 package manager does not start up, please delete the file installedPackageCache.ser in the packages folder that resides in the wekafiles folder in your user home. Also, serialized Weka models created in 3.7 are incompatible with 3.8. The model migrator tool can migrate some models to 3.8 (a known exception is RandomForest). Usage is as follows: java -cp : weka.core.ModelMigrator -i -o ","title":"Upgrading from Weka 3.7"},{"location":"ensemble_selection/","text":"Notes # This bug has now been fixed. (12/2014) There is a bug in the code to build a library -- trying to build any model specification with three layers (e.g., Bagging a REPTree) causes the form to freeze up and/or crash. The documentation on how to run from the command line is outdated. Some corrections: The \"-D\" option no longer exists. The command shown for training a library from the command line: java weka.classifiers.meta.EnsembleSelection -no-cv -v -L path/to/your/mode/list/file.model.xml -W /path/to/your/working/directory -A library -X 5 -S 1 -O -t yourTrainingInstances.arff fails for me with an exception that \"Folds 1 and 5 are not equal.\" A command line that works is to set the folds to 1: java weka.classifiers.meta.EnsembleSelection -no-cv -v -L path/to/your/mode/list/file.model.xml -W /path/to/your/working/directory -A library -X 1 -S 1 -O -t yourTrainingInstances.arff Links # Ensemble_selection.pdf - Documentation on how to use Ensemble Selection in Weka Ensemble Selection from Libraries of Models, ICML'04","title":"Notes"},{"location":"ensemble_selection/#notes","text":"This bug has now been fixed. (12/2014) There is a bug in the code to build a library -- trying to build any model specification with three layers (e.g., Bagging a REPTree) causes the form to freeze up and/or crash. The documentation on how to run from the command line is outdated. Some corrections: The \"-D\" option no longer exists. 
The command shown for training a library from the command line: java weka.classifiers.meta.EnsembleSelection -no-cv -v -L path/to/your/mode/list/file.model.xml -W /path/to/your/working/directory -A library -X 5 -S 1 -O -t yourTrainingInstances.arff fails for me with an exception that \"Folds 1 and 5 are not equal.\" A command line that works is to set the folds to 1: java weka.classifiers.meta.EnsembleSelection -no-cv -v -L path/to/your/mode/list/file.model.xml -W /path/to/your/working/directory -A library -X 1 -S 1 -O -t yourTrainingInstances.arff","title":"Notes"},{"location":"ensemble_selection/#links","text":"Ensemble_selection.pdf - Documentation on how to use Ensemble Selection in Weka Ensemble Selection from Libraries of Models, ICML'04","title":"Links"},{"location":"extending_weka/","text":"The following articles describe how you can extend Weka: Writing a new Filter Writing a new Classifier Writing your own Classifier Article","title":"Extending Weka"},{"location":"faq/","text":"General # What are the principal release branches of Weka? Where can I get old versions of WEKA? How do I get the latest bugfixes? Can I check my CLASSPATH from within WEKA? Where is my home directory located? Can I check how much memory is available for WEKA? Can I use WEKA in commercial applications? Basic usage # Can I use CSV files? How do I perform CSV file conversion? How do I divide a dataset into training and test set? How do I generate compatible train and test sets that get processed with a filter? How do I perform attribute selection? How do I perform clustering? Where do I find visualization of classifiers, etc.? How do I perform text classification? How can I perform multi-instance learning in WEKA? How do I perform cost-sensitive classification? How do I make predictions with a trained model? Why am I missing certain nominal or string values from sparse instances? Can I use WEKA for time series analysis? Does WEKA support multi-label classification? How do I perform one-class classification? Can I make a screenshot of a plot or graph directly in WEKA? How do I use the package manager? What do I do if the package manager does not start? Advanced usage # How can I track instances in WEKA? How do I use ID attributes? How do I connect to a database? How do I use WEKA from the command line? Can I tune the parameters of a classifier? How do I generate Learning curves? Where can I find information regarding ROC curves? I have unbalanced data - now what? Can I run an experiment using clusterers in the Experimenter? How can I use transactional data in Weka? How can I use Weka with Matlab or Octave? How can I speed up Weka? Can I use GPUs to speed up Weka? Customizing Weka # Can I change the colors (background, axes, etc.) of the plots in WEKA? How do I add a new classifier, filter, kernel, etc Using third-party tools # How do I use libsvm in WEKA? The snowball stemmers don't work, what am I doing wrong? Developing with WEKA # Where can I get WEKA's source code? How do I compile WEKA? What is Git and what do I need to do to access it? How do I use WEKA's classes in my own code? How do I write a new classifier or filter? Can I compile WEKA into native code? Can I use WEKA from C#? Can I use WEKA from Python? Can I use WEKA from Groovy? Serialization is nice, but what about generating actual Java code from WEKA classes? How are packages structured for the package management system? Pluggable evaluation metrics for classification/regression How can I contribute to WEKA? 
Windows # How do I modify the CLASSPATH? How do I modify the RunWeka.bat file? Can I process UTF-8 datasets or files? How do I run the Windows Weka installer in silent mode? Troubleshooting # I have Weka download problems - what's going wrong? My ARFF file doesn't load - why? What does nominal value not declared in header, read Token[X], line Y mean? ) How do I get rid of this OutOfMemoryException? How do I deal with a StackOverflowError? Why do I get the error message 'training and test set are not compatible'? Couldn't read from database: unknown data type Trying to add JDBC driver: ... - Error, not in CLASSPATH? I cannot process large datasets - any ideas? See Troubleshooting article for more troubleshooting.","title":"FAQ"},{"location":"faq/#general","text":"What are the principal release branches of Weka? Where can I get old versions of WEKA? How do I get the latest bugfixes? Can I check my CLASSPATH from within WEKA? Where is my home directory located? Can I check how much memory is available for WEKA? Can I use WEKA in commercial applications?","title":"General"},{"location":"faq/#basic-usage","text":"Can I use CSV files? How do I perform CSV file conversion? How do I divide a dataset into training and test set? How do I generate compatible train and test sets that get processed with a filter? How do I perform attribute selection? How do I perform clustering? Where do I find visualization of classifiers, etc.? How do I perform text classification? How can I perform multi-instance learning in WEKA? How do I perform cost-sensitive classification? How do I make predictions with a trained model? Why am I missing certain nominal or string values from sparse instances? Can I use WEKA for time series analysis? Does WEKA support multi-label classification? How do I perform one-class classification? Can I make a screenshot of a plot or graph directly in WEKA? How do I use the package manager? What do I do if the package manager does not start?","title":"Basic usage"},{"location":"faq/#advanced-usage","text":"How can I track instances in WEKA? How do I use ID attributes? How do I connect to a database? How do I use WEKA from the command line? Can I tune the parameters of a classifier? How do I generate Learning curves? Where can I find information regarding ROC curves? I have unbalanced data - now what? Can I run an experiment using clusterers in the Experimenter? How can I use transactional data in Weka? How can I use Weka with Matlab or Octave? How can I speed up Weka? Can I use GPUs to speed up Weka?","title":"Advanced usage"},{"location":"faq/#customizing-weka","text":"Can I change the colors (background, axes, etc.) of the plots in WEKA? How do I add a new classifier, filter, kernel, etc","title":"Customizing Weka"},{"location":"faq/#using-third-party-tools","text":"How do I use libsvm in WEKA? The snowball stemmers don't work, what am I doing wrong?","title":"Using third-party tools"},{"location":"faq/#developing-with-weka","text":"Where can I get WEKA's source code? How do I compile WEKA? What is Git and what do I need to do to access it? How do I use WEKA's classes in my own code? How do I write a new classifier or filter? Can I compile WEKA into native code? Can I use WEKA from C#? Can I use WEKA from Python? Can I use WEKA from Groovy? Serialization is nice, but what about generating actual Java code from WEKA classes? How are packages structured for the package management system? 
Pluggable evaluation metrics for classification/regression How can I contribute to WEKA?","title":"Developing with WEKA"},{"location":"faq/#windows","text":"How do I modify the CLASSPATH? How do I modify the RunWeka.bat file? Can I process UTF-8 datasets or files? How do I run the Windows Weka installer in silent mode?","title":"Windows"},{"location":"faq/#troubleshooting","text":"I have Weka download problems - what's going wrong? My ARFF file doesn't load - why? What does nominal value not declared in header, read Token[X], line Y mean? ) How do I get rid of this OutOfMemoryException? How do I deal with a StackOverflowError? Why do I get the error message 'training and test set are not compatible'? Couldn't read from database: unknown data type Trying to add JDBC driver: ... - Error, not in CLASSPATH? I cannot process large datasets - any ideas? See Troubleshooting article for more troubleshooting.","title":"Troubleshooting"},{"location":"feature_extraction_from_images/","text":"ImageJ can be used to extract features from images. ImageJ contains a macro language with which it is easy to extract features and then dump them into an ARFF file. Links # ImageJ homepage","title":"Feature extraction from images"},{"location":"feature_extraction_from_images/#links","text":"ImageJ homepage","title":"Links"},{"location":"filtered_classifier_updateable/","text":"Description # Incremental version of weka.classifiers.meta.FilteredClassifier , which takes only incremental base classifiers (i.e., classifiers implementing weka.classifiers.UpdateableClassifier ). Reference # -none- Package # weka.classifiers.meta Download # Source code: FilteredClassifierUpdateable.java Example class: FilteredUpdateableTest.java Additional Information # -none- Version # Tested with source code from git (= trunk/weka ) as of 10/11/2008.","title":"Description"},{"location":"filtered_classifier_updateable/#description","text":"Incremental version of weka.classifiers.meta.FilteredClassifier , which takes only incremental base classifiers (i.e., classifiers implementing weka.classifiers.UpdateableClassifier ).","title":"Description"},{"location":"filtered_classifier_updateable/#reference","text":"-none-","title":"Reference"},{"location":"filtered_classifier_updateable/#package","text":"weka.classifiers.meta","title":"Package"},{"location":"filtered_classifier_updateable/#download","text":"Source code: FilteredClassifierUpdateable.java Example class: FilteredUpdateableTest.java","title":"Download"},{"location":"filtered_classifier_updateable/#additional-information","text":"-none-","title":"Additional Information"},{"location":"filtered_classifier_updateable/#version","text":"Tested with source code from git (= trunk/weka ) as of 10/11/2008.","title":"Version"},{"location":"generating_and_saving_a_precision_recall_curve/","text":"The following Java class evaluates a NaiveBayes classifier using cross-validation with a dataset provided by the user and saves a precision-recall curve for the first class label as a JPEG file, based on a user-specified file name. Source code: import java.awt.* ; import java.io.* ; import java.util.* ; import javax.swing.* ; import weka.core.* ; import weka.classifiers.* ; import weka.classifiers.bayes.NaiveBayes ; import weka.classifiers.evaluation.Evaluation ; import weka.classifiers.evaluation.ThresholdCurve ; import weka.gui.visualize.* ; /** * Generates and saves a precision-recall curve. Uses a cross-validation * with NaiveBayes to make the curve. 
* * @author FracPete * @author Eibe Frank */ public class SavePrecisionRecallCurve { /** * takes two arguments: dataset in ARFF format (expects class to * be last attribute) and name of file with output */ public static void main ( String [] args ) throws Exception { // load data Instances data = new Instances ( new BufferedReader ( new FileReader ( args [ 0 ] ))); data . setClassIndex ( data . numAttributes () - 1 ); // train classifier Classifier cl = new NaiveBayes (); Evaluation eval = new Evaluation ( data ); eval . crossValidateModel ( cl , data , 10 , new Random ( 1 )); // generate curve ThresholdCurve tc = new ThresholdCurve (); int classIndex = 0 ; Instances result = tc . getCurve ( eval . predictions (), classIndex ); // plot curve ThresholdVisualizePanel vmc = new ThresholdVisualizePanel (); PlotData2D tempd = new PlotData2D ( result ); // specify which points are connected boolean [] cp = new boolean [ result . numInstances () ] ; for ( int n = 1 ; n < cp . length ; n ++ ) cp [ n ] = true ; tempd . setConnectPoints ( cp ); // add plot vmc . addPlot ( tempd ); // We want a precision-recall curve vmc . setXIndex ( result . attribute ( \"Recall\" ). index ()); vmc . setYIndex ( result . attribute ( \"Precision\" ). index ()); // Make window with plot but don't show it JFrame jf = new JFrame (); jf . setSize ( 500 , 400 ); jf . getContentPane (). add ( vmc ); jf . pack (); // Save to file specified as second argument (can use any of // BMPWriter, JPEGWriter, PNGWriter, PostscriptWriter for different formats) JComponentWriter jcw = new JPEGWriter ( vmc . getPlotPanel (), new File ( args [ 1 ] )); jcw . toOutput (); // exit with status 0 to signal success System . exit ( 0 ); } } See also # ROC curves Visualizing ROC curve Plotting multiple ROC curves Version # Needs the developer version >=3.5.1 or 3.6.x","title":"Generating and saving a precision recall curve"},{"location":"generating_and_saving_a_precision_recall_curve/#see-also","text":"ROC curves Visualizing ROC curve Plotting multiple ROC curves","title":"See also"},{"location":"generating_and_saving_a_precision_recall_curve/#version","text":"Needs the developer version >=3.5.1 or 3.6.x","title":"Version"},{"location":"generating_classifier_evaluation_output_manually/","text":"In the following are some code snippets that explain how to generate the output Weka produces when one runs a classifier from the commandline. When referring to the Evaluation class, the weka.classifiers.Evaluation class is meant. This article provides only a quick overview; for more details, please see the Javadoc of the Evaluation class. Model # A classifier's model, if the classifier supports outputting it, can simply be printed using the toString() method after it has been trained: Instances data = ... // from somewhere Classifier cls = new weka . classifiers . trees . J48 (); cls . buildClassifier ( data ); System . out . println ( cls ); NB: Weka always outputs the model based on the full training set (provided with the option -t ), no matter whether cross-validation is used or a designated test set (via -T ). The 10 models generated during a 10-fold cross-validation run are never output. If you want to output these models, you have to simulate the crossValidateModel method yourself or use the KnowledgeFlow (see article Displaying results of cross-validation folds ). Statistics # The statistics, also called the summary of an evaluation, can be generated via the toSummaryString methods.
Here is an example of the summary from a cross-validated J48: Classifier cls = new J48 (); Evaluation eval = new Evaluation ( data ); Random rand = new Random ( 1 ); // using seed = 1 int folds = 10 ; eval . crossValidateModel ( cls , data , folds , rand ); System . out . println ( eval . toSummaryString ()); Detailed class statistics # In order to generate the detailed statistics per class (on the commandline via option -i ), one can use the toClassDetailsString methods. Once again a code snippet featuring a cross-validated J48: Classifier cls = new J48 (); Evaluation eval = new Evaluation ( data ); Random rand = new Random ( 1 ); // using seed = 1 int folds = 10 ; eval . crossValidateModel ( cls , data , folds , rand ); System . out . println ( eval . toClassDetailsString ()); Confusion matrix # The confusion matrix is simply output with the toMatrixString() or toMatrixString(String) method of the Evaluation class. In the following is an example of cross-validating J48 on a dataset and outputting the confusion matrix to stdout. Classifier cls = new J48 (); Evaluation eval = new Evaluation ( data ); Random rand = new Random ( 1 ); // using seed = 1 int folds = 10 ; eval . crossValidateModel ( cls , data , folds , rand ); System . out . println ( eval . toMatrixString ()); See also # Use Weka in your Java code - general overview of the Weka API","title":"Generating classifier evaluation output manually"},{"location":"generating_classifier_evaluation_output_manually/#model","text":"A classifier's model, if the classifier supports outputting it, can simply be printed using the toString() method after it has been trained: Instances data = ... // from somewhere Classifier cls = new weka . classifiers . trees . J48 (); cls . buildClassifier ( data ); System . out . println ( cls ); NB: Weka always outputs the model based on the full training set (provided with the option -t ), no matter whether cross-validation is used or a designated test set (via -T ). The 10 models generated during a 10-fold cross-validation run are never output. If you want to output these models, you have to simulate the crossValidateModel method yourself or use the KnowledgeFlow (see article Displaying results of cross-validation folds ).","title":"Model"},{"location":"generating_classifier_evaluation_output_manually/#statistics","text":"The statistics, also called the summary of an evaluation, can be generated via the toSummaryString methods. Here is an example of the summary from a cross-validated J48: Classifier cls = new J48 (); Evaluation eval = new Evaluation ( data ); Random rand = new Random ( 1 ); // using seed = 1 int folds = 10 ; eval . crossValidateModel ( cls , data , folds , rand ); System . out . println ( eval .
toClassDetailsString ());","title":"Detailed class statistics"},{"location":"generating_classifier_evaluation_output_manually/#confusion-matrix","text":"The confusion matrix is simply output with the toMatrixString() or toMatrixString(String) method of the Evaluation class. In the following is an example of cross-validating J48 on a dataset and outputting the confusion matrix to stdout. Classifier cls = new J48 (); Evaluation eval = new Evaluation ( data ); Random rand = new Random ( 1 ); // using seed = 1 int folds = 10 ; eval . crossValidateModel ( cls , data , folds , rand ); System . out . println ( eval . toMatrixString ());","title":"Confusion matrix"},{"location":"generating_classifier_evaluation_output_manually/#see-also","text":"Use Weka in your Java code - general overview of the Weka API","title":"See also"},{"location":"generating_cv_folds/","text":"You have two choices for generating cross-validation folds: Filter approach - uses a bash script to generate the train/test pairs beforehand Java approach - to be used from within your own Java code, creates train/test pairs on the fly","title":"Generating cv folds"},{"location":"generating_cv_folds_filter/","text":"The filter RemoveFolds (package weka.filters.unsupervised.instance ) can be used to generate the train/test splits used in cross-validation (for stratified folds, use weka.filters.supervised.instance.StratifiedRemoveFolds ). The filter has to be used twice for each train/test split, first to generate the train set and then to obtain the test set. Since this is rather cumbersome by hand, one can also put this into a bash script: #!/bin/bash # # expects the weka.jar as first parameter and the dataset to work on as # second parameter. # # FracPete, 2007-04-10 if [ ! $# -eq 2 ] then echo echo \"usage: folds.sh <weka.jar> <dataset>\" echo exit 1 fi JAR = $1 DATASET = $2 FOLDS = 10 FILTER = weka.filters.unsupervised.instance.RemoveFolds SEED = 1 for (( i = 1 ; i <= $FOLDS ; i++ )) do echo \"Generating pair $i / $FOLDS ...\" OUTFILE = ` echo $DATASET | sed s/ \"\\.arff\" //g ` # train set java -cp $JAR $FILTER -V -N $FOLDS -F $i -S $SEED -i $DATASET -o \" $OUTFILE -train- $i -of- $FOLDS .arff\" # test set java -cp $JAR $FILTER -N $FOLDS -F $i -S $SEED -i $DATASET -o \" $OUTFILE -test- $i -of- $FOLDS .arff\" done The script expects two parameters: the weka.jar (or the path to the Weka classes) the dataset to generate the train/test pairs from Example: ./folds.sh /some/where/weka.jar /some/where/else/dataset.arff This example will create the train/test splits for a 10-fold cross-validation at the same location as the original dataset, i.e., in the directory /some/where/else/ . Downloads # folds.sh","title":"Generating cv folds filter"},{"location":"generating_cv_folds_filter/#downloads","text":"folds.sh","title":"Downloads"},{"location":"generating_cv_folds_java/","text":"This article describes how to generate train/test splits for cross-validation using the Weka API directly. The following variables are given: Instances data = ...; // contains the full dataset we want to create train/test sets from int seed = ...; // the seed for randomizing the data int folds = ...; // the number of folds to generate, >=2 Randomize the data # First, randomize your data: Random rand = new Random ( seed ); // create seeded number generator randData = new Instances ( data ); // create copy of original data randData .
randomize ( rand ); // randomize data with number generator In case your data has a nominal class and you wanna perform stratified cross-validation: randData . stratify ( folds ); Generate the folds # Single run # Next thing that we have to do is creating the train and the test set: for ( int n = 0 ; n < folds ; n ++ ) { Instances train = randData . trainCV ( folds , n , rand ); Instances test = randData . testCV ( folds , n ); // further processing, classification, etc. ... } Note: the above code is used by the weka.filters.supervised.instance.StratifiedRemoveFolds filter the weka.classifiers.Evaluation class and the Explorer/Experimenter would use this method for obtaining the train set: Instances train = randData . trainCV ( folds , n , rand ); Multiple runs # The example above only performs one run of a cross-validation. In case you want to run 10 runs of 10-fold cross-validation, use the following loop: Instances data = ...; // our dataset again, obtained from somewhere int runs = 10 ; for ( int i = 0 ; i < runs ; i ++ ) { seed = i + 1 ; // every run gets a new, but defined seed value // see: randomize the data ... // see: generate the folds ... } See also # Use Weka in your Java code - for general use of the Weka API Downloads # CrossValidationSingleRun.java ( stable , developer ) - simulates a single run of 10-fold cross-validation CrossValidationSingleRunVariant.java ( stable , developer ) - simulates a single run of 10-fold cross-validation, but outputs the confusion matrices for each single train/test pair as well. CrossValidationMultipleRuns.java ( stable , developer ) - simulates 10 runs of 10-fold cross-validation CrossValidationAddPrediction.java ( stable , developer ) - simulates a single run of 10-fold cross-validation, but also adds the classification/distribution/error flag to the test data (uses the AddClassification filter)","title":"Generating cv folds java"},{"location":"generating_cv_folds_java/#randomize-the-data","text":"First, randomize your data: Random rand = new Random ( seed ); // create seeded number generator randData = new Instances ( data ); // create copy of original data randData . randomize ( rand ); // randomize data with number generator In case your data has a nominal class and you wanna perform stratified cross-validation: randData . stratify ( folds );","title":"Randomize the data"},{"location":"generating_cv_folds_java/#generate-the-folds","text":"","title":"Generate the folds"},{"location":"generating_cv_folds_java/#single-run","text":"Next thing that we have to do is creating the train and the test set: for ( int n = 0 ; n < folds ; n ++ ) { Instances train = randData . trainCV ( folds , n , rand ); Instances test = randData . testCV ( folds , n ); // further processing, classification, etc. ... } Note: the above code is used by the weka.filters.supervised.instance.StratifiedRemoveFolds filter the weka.classifiers.Evaluation class and the Explorer/Experimenter would use this method for obtaining the train set: Instances train = randData . trainCV ( folds , n , rand );","title":"Single run"},{"location":"generating_cv_folds_java/#multiple-runs","text":"The example above only performs one run of a cross-validation. In case you want to run 10 runs of 10-fold cross-validation, use the following loop: Instances data = ...; // our dataset again, obtained from somewhere int runs = 10 ; for ( int i = 0 ; i < runs ; i ++ ) { seed = i + 1 ; // every run gets a new, but defined seed value // see: randomize the data ... // see: generate the folds ... 
}","title":"Multiple runs"},{"location":"generating_cv_folds_java/#see-also","text":"Use Weka in your Java code - for general use of the Weka API","title":"See also"},{"location":"generating_cv_folds_java/#downloads","text":"CrossValidationSingleRun.java ( stable , developer ) - simulates a single run of 10-fold cross-validation CrossValidationSingleRunVariant.java ( stable , developer ) - simulates a single run of 10-fold cross-validation, but outputs the confusion matrices for each single train/test pair as well. CrossValidationMultipleRuns.java ( stable , developer ) - simulates 10 runs of 10-fold cross-validation CrossValidationAddPrediction.java ( stable , developer ) - simulates a single run of 10-fold cross-validation, but also adds the classification/distribution/error flag to the test data (uses the AddClassification filter)","title":"Downloads"},{"location":"generating_roc_curve/","text":"The following little Java class trains a NaiveBayes classifier with a dataset provided by the user and displays the ROC curve for the first class label. Source code: import java.awt.* ; import java.io.* ; import java.util.* ; import javax.swing.* ; import weka.core.* ; import weka.classifiers.* ; import weka.classifiers.bayes.NaiveBayes ; import weka.classifiers.evaluation.Evaluation ; import weka.classifiers.evaluation.ThresholdCurve ; import weka.gui.visualize.* ; /** * Generates and displays a ROC curve from a dataset. Uses a default * NaiveBayes to generate the ROC data. * * @author FracPete */ public class GenerateROC { /** * takes one argument: dataset in ARFF format (expects class to * be last attribute) */ public static void main ( String [] args ) throws Exception { // load data Instances data = new Instances ( new BufferedReader ( new FileReader ( args [ 0 ] ))); data . setClassIndex ( data . numAttributes () - 1 ); // train classifier Classifier cl = new NaiveBayes (); Evaluation eval = new Evaluation ( data ); eval . crossValidateModel ( cl , data , 10 , new Random ( 1 )); // generate curve ThresholdCurve tc = new ThresholdCurve (); int classIndex = 0 ; Instances result = tc . getCurve ( eval . predictions (), classIndex ); // plot curve ThresholdVisualizePanel vmc = new ThresholdVisualizePanel (); vmc . setROCString ( \"(Area under ROC = \" + Utils . doubleToString ( tc . getROCArea ( result ), 4 ) + \")\" ); vmc . setName ( result . relationName ()); PlotData2D tempd = new PlotData2D ( result ); tempd . setPlotName ( result . relationName ()); tempd . addInstanceNumberAttribute (); // specify which points are connected boolean [] cp = new boolean [ result . numInstances () ] ; for ( int n = 1 ; n < cp . length ; n ++ ) cp [ n ] = true ; tempd . setConnectPoints ( cp ); // add plot vmc . addPlot ( tempd ); // display curve String plotName = vmc . getName (); final javax . swing . JFrame jf = new javax . swing . JFrame ( \"Weka Classifier Visualize: \" + plotName ); jf . setSize ( 500 , 400 ); jf . getContentPane (). setLayout ( new BorderLayout ()); jf . getContentPane (). add ( vmc , BorderLayout . CENTER ); jf . addWindowListener ( new java . awt . event . WindowAdapter () { public void windowClosing ( java . awt . event . WindowEvent e ) { jf . dispose (); } }); jf . 
setVisible ( true ); } } See also # ROC curves Visualizing ROC curve Plotting multiple ROC curves Downloads # GenerateROC.java ( stable , developer )","title":"Generating roc curve"},{"location":"generating_roc_curve/#see-also","text":"ROC curves Visualizing ROC curve Plotting multiple ROC curves","title":"See also"},{"location":"generating_roc_curve/#downloads","text":"GenerateROC.java ( stable , developer )","title":"Downloads"},{"location":"generating_source_code_from_weka_classes/","text":"Some of the schemes in Weka can generate Java source code that represents their current internal state. At the moment these are classifiers (book and developer version) and filters (>3.5.6). The generated code can be used within Weka as normal classifier/filter, since this code will be derived from the same superclass ( weka.classifiers.Classifier or weka.filters.Filter ) as the generating code. Note: The commands listed here are for a Linux/Unix bash (the backslash tells the shell that the command isn't finished yet and continues on the next line). In case of Windows or the SimpleCLI, just remove the backslashes and put everything on one line. Classifiers # Instead of using a serialized filter to perform further classifications/predictions, one can also obtain source code from a trained classifier and use this instead. The advantage of this is being less dependent on version changes and incompatible serialized files. All classifiers implementing the weka.classifiers.Sourcable interface can turn their model into Java source code (check the Javadoc of this interface for all the classifiers implementing it). Here's an example of generating source code from a trained J48 (the source code is saved in a file called WekaWrapper.java ): java weka.classifiers.trees.J48 \\ -t /some/where/data.arff \\ -z SourcedJ48 \\ # name of the inner class, gets called by wrapper class WekaWrapper > /else/where/WekaWrapper.java # redirecting the output of the code into a file The package of the wrapper class is by default the weka.classifiers package. Make sure that you place the source code and/or class files in the correct location. The generated classifier can be used from the commandline or GUI like any other classifier within Weka, you only need to make sure that your GenericObjectEditor lists the package you place the classifier in ( weka.classifiers is not listed by default). The following command calls the generated classifier with a training set (training has no effect, of course) and outputs the predictions for this dataset to stdout : java weka.classifiers.WekaWrapper \\ -t /some/file.arff \\ -p 0 # output predictions for training set Note: the Explorer can output source code as well, you only have to check the Output source code option in the More options dialog. Filters # With versions of Weka later than 3.5.6 of the developer version, one can now also turn filters into source code. The process is basically the same as with classifiers outlined above. All filters that implement the weka.filters.Sourcable interface can be turned into Java code (again, check out the Javadoc for this interface, to see the filters implementing it). 
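As an aside, the Sourcable mechanism described above for classifiers is also reachable programmatically, not only via the command line. The following is a minimal, hypothetical sketch: the dataset path, output file name and generated class name are all illustrative, and it assumes the toSource(String) method declared by the weka.classifiers.Sourcable interface (which J48 implements, as noted above):

```java
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.PrintWriter;

import weka.classifiers.trees.J48;
import weka.core.Instances;

// Hypothetical sketch: obtain Java source code from a trained, Sourcable classifier.
public class SourceFromJ48 {
  public static void main(String[] args) throws Exception {
    // load the training data; the path is illustrative
    Instances data = new Instances(new BufferedReader(new FileReader("/some/where/data.arff")));
    data.setClassIndex(data.numAttributes() - 1);

    // train J48, which implements weka.classifiers.Sourcable
    J48 cls = new J48();
    cls.buildClassifier(data);

    // toSource takes the name to use for the generated class;
    // here the output is simply written to a .java file
    try (PrintWriter out = new PrintWriter("SourcedJ48.java")) {
      out.println(cls.toSource("SourcedJ48"));
    }
  }
}
```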
The following command turns an initialized ReplaceMissingValues filter into source code: java weka.filters.unsupervised.attribute.ReplaceMissingValues \\ -i /somewhere1/input.arff \\ -o /somewhere2/output.arff \\ -z SourcedRMV \\ # name of the inner class, gets called by wrapper class WekaWrapper > /some/place/WekaWrapper.java # redirecting the output of the code into a file The package of the wrapper class is by default the weka.filters package. Make sure that you place the source code and/or class files in the correct location. The generated filter can be used from the commandline or GUI like any other filter within Weka, you only need to make sure that your GenericObjectEditor lists the package you place the filter in. And again a little demonstration of how to call the generated source code: java weka.filters.WekaWrapper \\ -i /some/where/input.arff \\ # must have the same structure as **/somewhere1/input.arff**, of course -o /other/place/output.arff See also # Serialization - can be used for all classifiers and filters to save them in a persistent state.","title":"Generating source code from weka classes"},{"location":"generating_source_code_from_weka_classes/#classifiers","text":"Instead of using a serialized filter to perform further classifications/predictions, one can also obtain source code from a trained classifier and use this instead. The advantage of this is being less dependent on version changes and incompatible serialized files. All classifiers implementing the weka.classifiers.Sourcable interface can turn their model into Java source code (check the Javadoc of this interface for all the classifiers implementing it). Here's an example of generating source code from a trained J48 (the source code is saved in a file called WekaWrapper.java ): java weka.classifiers.trees.J48 \\ -t /some/where/data.arff \\ -z SourcedJ48 \\ # name of the inner class, gets called by wrapper class WekaWrapper > /else/where/WekaWrapper.java # redirecting the output of the code into a file The package of the wrapper class is by default the weka.classifiers package. Make sure that you place the source code and/or class files in the correct location. The generated classifier can be used from the commandline or GUI like any other classifier within Weka, you only need to make sure that your GenericObjectEditor lists the package you place the classifier in ( weka.classifiers is not listed by default). The following command calls the generated classifier with a training set (training has no effect, of course) and outputs the predictions for this dataset to stdout : java weka.classifiers.WekaWrapper \\ -t /some/file.arff \\ -p 0 # output predictions for training set Note: the Explorer can output source code as well, you only have to check the Output source code option in the More options dialog.","title":"Classifiers"},{"location":"generating_source_code_from_weka_classes/#filters","text":"With versions of Weka later than 3.5.6 of the developer version, one can now also turn filters into source code. The process is basically the same as with classifiers outlined above. All filters that implement the weka.filters.Sourcable interface can be turned into Java code (again, check out the Javadoc for this interface, to see the filters implementing it). 
The following command turns an initialized ReplaceMissingValues filter into source code: java weka.filters.unsupervised.attribute.ReplaceMissingValues \\ -i /somewhere1/input.arff \\ -o /somewhere2/output.arff \\ -z SourcedRMV \\ # name of the inner class, gets called by wrapper class WekaWrapper > /some/place/WekaWrapper.java # redirecting the output of the code into a file The package of the wrapper class is by default the weka.filters package. Make sure that you place the source code and/or class files in the correct location. The generated filter can be used from the commandline or GUI like any other filter within Weka, you only need to make sure that your GenericObjectEditor lists the package you place the filter in. And again a little demonstration of how to call the generated source code: java weka.filters.WekaWrapper \\ -i /some/where/input.arff \\ # must have the same structure as **/somewhere1/input.arff**, of course -o /other/place/output.arff","title":"Filters"},{"location":"generating_source_code_from_weka_classes/#see-also","text":"Serialization - can be used for all classifiers and filters to save them in a persistent state.","title":"See also"},{"location":"generic_object_editor/","text":"The GenericObjectEditor is the core component in Weka for modifying schemes, like classifiers and filters in the GUI. It has to be configured correctly in order to show default and additional schemes. See the following articles for more details: GenericObjectEditor (book version) GenericObjectEditor (developer version)","title":"Generic object editor"},{"location":"generic_object_editor_book_version/","text":"Introduction # As of version 3.4.4 it is possible for WEKA to dynamically discover classes at runtime (rather than using only those specified in the GenericObjectEditor.props (GOE) file). If dynamic class discovery is too slow, e.g., due to an enormous CLASSPATH, you can generate a new GenericObjectEditor.props file and then turn dynamic class discovery off. It is assumed that you already placed the GenericPropertiesCreator.props (GPC) file in your home directory (this file is located in directory weka/gui of either the weka.jar or weka-src.jar ZIP archive) and that the weka.jar jar archive with the WEKA classes is in your CLASSPATH (otherwise you have to add it to the java call using the -classpath option). For generating the GOE file, execute the following steps: generate a new GenericObjectEditor.props file using the following command: Linux/Unix java weka.gui.GenericPropertiesCreator \\ $HOME/GenericPropertiesCreator.props \\ $HOME/GenericObjectEditor.props Windows (command must be in one line) java weka.gui.GenericPropertiesCreator %USERPROFILE%\\GenericPropertiesCreator.props %USERPROFILE%\\GenericObjectEditor.props edit the GenericPropertiesCreator.props file in your home directory and set UseDynamic to false . For disabling dynamic class discovery, you need to set the boolean constant USE_DYNAMIC of the weka.gui.GenericObjectEditor class to false . See article Compiling WEKA for more information on how to compile a modified version of WEKA. A limitation of the GOE prior to 3.4.4 was, that additional classifiers, filters, etc., had to fit into the same package structure as the already existing ones, i.e., all had to be located below weka . 
WEKA can now display multiple class hierarchies in the GUI, which makes adding new functionality quite easy as we will see later in an example (it is not restricted to classifiers only, but also works with all the other entries in the GPC file). File Structure # The structure of the GOE so far was a key-value-pair, separated by an equals -sign. The value is a comma separated list of classes that are all derived from the superclass/superinterface key . The GPC is slightly different, instead of declaring all the classes/interfaces one need only to specify all the packages descendants are located in (only non-abstract ones are then listed). E.g., the weka.classifiers.Classifier entry in the GOE file looks like this: weka.classifiers.Classifier = \\ weka.classifiers.bayes.AODE, \\ weka.classifiers.bayes.BayesNet, \\ weka.classifiers.bayes.ComplementNaiveBayes, \\ weka.classifiers.bayes.NaiveBayes, \\ weka.classifiers.bayes.NaiveBayesMultinomial, \\ weka.classifiers.bayes.NaiveBayesSimple, \\ weka.classifiers.bayes.NaiveBayesUpdateable, \\ weka.classifiers.functions.LeastMedSq, \\ weka.classifiers.functions.LinearRegression, \\ weka.classifiers.functions.Logistic, \\ ... The entry producing the same output for the classifiers in the GPC looks like this (7 lines instead of over 70!): weka.classifiers.Classifier = \\ weka.classifiers.bayes, \\ weka.classifiers.functions, \\ weka.classifiers.lazy, \\ weka.classifiers.meta, \\ weka.classifiers.trees, \\ weka.classifiers.rules Class Discovery # Unlike the Class.forName(String) method that grabs the first class it can find in the CLASSPATH , and therefore fixes the location of the package it found the class in, the dynamic discovery examines the complete CLASSPATH you're starting the Java Virtual Machine (JVM) with. This means that you can have several parallel directories with the same WEKA package structure, e.g. the standard release of WEKA in one directory ( /distribution/weka.jar ) and another one with your own classes ( /development/weka/... ), and display all of the classifiers in the GUI. In case of a name conflict, i.e. two directories contain the same class, the first one that can be found is used. In a nutshell, your java call of the GUIChooser could look like this: java -classpath \"/development:/distribution/weka.jar\" weka.gui.GUIChooser Note: Windows users have to replace the \":\" with \";\" and the forward slashes with backslashes. Multiple Class Hierarchies # In case you're developing your own framework, but still want to use your classifiers within WEKA that wasn't possible so far. With the release 3.4.4 it is possible to have multiple class hierarchies being displayed in the GUI. If you've developed a modified version of J48, let's call it MyJ48 and it's located in the package dummy.classifiers then you'll have to add this package to the classifiers list in the GPC file like this: weka.classifiers.Classifier = \\ weka.classifiers.bayes, \\ weka.classifiers.functions, \\ weka.classifiers.lazy, \\ weka.classifiers.meta, \\ weka.classifiers.trees, \\ weka.classifiers.rules, \\ dummy.classifiers Your java call for the GUIChooser might look like this: java -classpath \"weka.jar:dummy.jar\" weka.gui.GUIChooser Note: Windows users have to replace the \":\" with \";\" and the forward slashes with backslashes. 
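To make the example concrete, such a MyJ48 could be as small as the following sketch. The class is hypothetical and inherits everything from J48; its only purpose is to demonstrate a classifier living outside the weka package hierarchy:

```java
// Hypothetical class for the dummy.jar used in the example above.
package dummy.classifiers;

import weka.classifiers.trees.J48;

// Inherits all behavior from J48; it exists only to show up
// under the dummy.classifiers branch in the GenericObjectEditor.
public class MyJ48 extends J48 {
}
```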
Starting up the GUI you'll now have another root node in the tree view of the classifiers, called root , and below it the weka and the dummy package hierarchy as you can see here: Links # GenericObjectEditor (developer version) CLASSPATH Properties file GenericPropertiesCreator.props","title":"Introduction"},{"location":"generic_object_editor_book_version/#introduction","text":"As of version 3.4.4 it is possible for WEKA to dynamically discover classes at runtime (rather than using only those specified in the GenericObjectEditor.props (GOE) file). If dynamic class discovery is too slow, e.g., due to an enormous CLASSPATH, you can generate a new GenericObjectEditor.props file and then turn dynamic class discovery off. It is assumed that you already placed the GenericPropertiesCreator.props (GPC) file in your home directory (this file is located in directory weka/gui of either the weka.jar or weka-src.jar ZIP archive) and that the weka.jar jar archive with the WEKA classes is in your CLASSPATH (otherwise you have to add it to the java call using the -classpath option). For generating the GOE file, execute the following steps: generate a new GenericObjectEditor.props file using the following command: Linux/Unix java weka.gui.GenericPropertiesCreator \\ $HOME/GenericPropertiesCreator.props \\ $HOME/GenericObjectEditor.props Windows (command must be in one line) java weka.gui.GenericPropertiesCreator %USERPROFILE%\\GenericPropertiesCreator.props %USERPROFILE%\\GenericObjectEditor.props edit the GenericPropertiesCreator.props file in your home directory and set UseDynamic to false . For disabling dynamic class discovery, you need to set the boolean constant USE_DYNAMIC of the weka.gui.GenericObjectEditor class to false . See article Compiling WEKA for more information on how to compile a modified version of WEKA. A limitation of the GOE prior to 3.4.4 was, that additional classifiers, filters, etc., had to fit into the same package structure as the already existing ones, i.e., all had to be located below weka . WEKA can now display multiple class hierarchies in the GUI, which makes adding new functionality quite easy as we will see later in an example (it is not restricted to classifiers only, but also works with all the other entries in the GPC file).","title":"Introduction"},{"location":"generic_object_editor_book_version/#file-structure","text":"The structure of the GOE so far was a key-value-pair, separated by an equals -sign. The value is a comma separated list of classes that are all derived from the superclass/superinterface key . The GPC is slightly different, instead of declaring all the classes/interfaces one need only to specify all the packages descendants are located in (only non-abstract ones are then listed). E.g., the weka.classifiers.Classifier entry in the GOE file looks like this: weka.classifiers.Classifier = \\ weka.classifiers.bayes.AODE, \\ weka.classifiers.bayes.BayesNet, \\ weka.classifiers.bayes.ComplementNaiveBayes, \\ weka.classifiers.bayes.NaiveBayes, \\ weka.classifiers.bayes.NaiveBayesMultinomial, \\ weka.classifiers.bayes.NaiveBayesSimple, \\ weka.classifiers.bayes.NaiveBayesUpdateable, \\ weka.classifiers.functions.LeastMedSq, \\ weka.classifiers.functions.LinearRegression, \\ weka.classifiers.functions.Logistic, \\ ... 
The entry producing the same output for the classifiers in the GPC looks like this (7 lines instead of over 70!): weka.classifiers.Classifier = \\ weka.classifiers.bayes, \\ weka.classifiers.functions, \\ weka.classifiers.lazy, \\ weka.classifiers.meta, \\ weka.classifiers.trees, \\ weka.classifiers.rules","title":"File Structure"},{"location":"generic_object_editor_book_version/#class-discovery","text":"Unlike the Class.forName(String) method that grabs the first class it can find in the CLASSPATH , and therefore fixes the location of the package it found the class in, the dynamic discovery examines the complete CLASSPATH you're starting the Java Virtual Machine (JVM) with. This means that you can have several parallel directories with the same WEKA package structure, e.g. the standard release of WEKA in one directory ( /distribution/weka.jar ) and another one with your own classes ( /development/weka/... ), and display all of the classifiers in the GUI. In case of a name conflict, i.e. two directories contain the same class, the first one that can be found is used. In a nutshell, your java call of the GUIChooser could look like this: java -classpath \"/development:/distribution/weka.jar\" weka.gui.GUIChooser Note: Windows users have to replace the \":\" with \";\" and the forward slashes with backslashes.","title":"Class Discovery"},{"location":"generic_object_editor_book_version/#multiple-class-hierarchies","text":"In case you're developing your own framework, but still want to use your classifiers within WEKA that wasn't possible so far. With the release 3.4.4 it is possible to have multiple class hierarchies being displayed in the GUI. If you've developed a modified version of J48, let's call it MyJ48 and it's located in the package dummy.classifiers then you'll have to add this package to the classifiers list in the GPC file like this: weka.classifiers.Classifier = \\ weka.classifiers.bayes, \\ weka.classifiers.functions, \\ weka.classifiers.lazy, \\ weka.classifiers.meta, \\ weka.classifiers.trees, \\ weka.classifiers.rules, \\ dummy.classifiers Your java call for the GUIChooser might look like this: java -classpath \"weka.jar:dummy.jar\" weka.gui.GUIChooser Note: Windows users have to replace the \":\" with \";\" and the forward slashes with backslashes. Starting up the GUI you'll now have another root node in the tree view of the classifiers, called root , and below it the weka and the dummy package hierarchy as you can see here:","title":"Multiple Class Hierarchies"},{"location":"generic_object_editor_book_version/#links","text":"GenericObjectEditor (developer version) CLASSPATH Properties file GenericPropertiesCreator.props","title":"Links"},{"location":"generic_object_editor_developer_version/","text":"Introduction # As of version 3.4.4 it is possible for WEKA to dynamically discover classes at runtime (rather than using only those specified in the GenericObjectEditor.props (GOE) file). In some versions (3.5.8, 3.6.0) this facility was not enabled by default as it is a bit slower than the GOE file approach, and, furthermore, does not function in environments that do not have a CLASSPATH (e.g., application servers). Later versions (3.6.1, 3.7.0) enabled the dynamic discovery again, as WEKA can now distinguish between being a standalone Java application or being run in a non-CLASSPATH environment. If you wish to enable or disable dynamic class discovery, the relevant file to edit is GenericPropertiesCreator.props (GPC). 
You can obtain this file either from the weka.jar or weka-src.jar archive. Open one of these files with an archive manager that can handle ZIP files (for Windows users, you can use 7-Zip for this) and navigate to the weka/gui directory, where the GPC file is located. All that is required is to change the UseDynamic property in this file from false to true (for enabling it) or the other way round (for disabling it). After changing the file, you just place it in your home directory. In order to find out the location of your home directory, do the following: Linux/Unix Open a terminal and run the following command: echo $HOME Windows Open a command prompt and run the following command: echo %USERPROFILE% If dynamic class discovery is too slow, e.g., due to an enormous CLASSPATH, you can generate a new GenericObjectEditor.props file and then turn dynamic class discovery off again. It is assumed that you have already placed the GPC file in your home directory (see steps above) and that the weka.jar jar archive with the WEKA classes is in your CLASSPATH (otherwise you have to add it to the java call using the -classpath option). For generating the GOE file, execute the following steps: generate a new GenericObjectEditor.props file using the following command: Linux/Unix java weka.gui.GenericPropertiesCreator \\ $HOME/GenericPropertiesCreator.props \\ $HOME/GenericObjectEditor.props Windows (command must be in one line) java weka.gui.GenericPropertiesCreator %USERPROFILE%\\GenericPropertiesCreator.props %USERPROFILE%\\GenericObjectEditor.props edit the GenericPropertiesCreator.props file in your home directory and set UseDynamic to false . A limitation of the GOE prior to 3.4.4 was that additional classifiers, filters, etc., had to fit into the same package structure as the already existing ones, i.e., all had to be located below weka .
For that reason one can list classes, interfaces, superclasses for certain packages to be excluded from display. This exclusion is done with the following file: weka/gui/GenericPropertiesCreator.excludes The format of this properties file is fairly easy: =:[,:] Where the corresponds to a key in the GenericPropertiesCreator.props file and the can be one of the following: S - Superclass any class class derived from this will be excluded I - Interface any class implementing this interface will be excluded C - Class exactly this class will be excluded Here are a few examples: # exclude all ResultListeners that also implement the ResultProducer interface # (all ResultProducers do that!) weka.experiment.ResultListener = \\ I:weka.experiment.ResultProducer # exclude J48 and all SingleClassifierEnhancers weka.classifiers.Classifier = \\ C:weka.classifiers.trees.J48, \\ S:weka.classifiers.SingleClassifierEnhancer Class Discovery # Unlike the Class.forName(String) method that grabs the first class it can find in the CLASSPATH , and therefore fixes the location of the package it found the class in, the dynamic discovery examines the complete CLASSPATH you're starting the Java Virtual Machine (JVM) with. This means that you can have several parallel directories with the same WEKA package structure, e.g. the standard release of WEKA in one directory ( /distribution/weka.jar ) and another one with your own classes ( /development/weka/... ), and display all of the classifiers in the GUI. In case of a name conflict, i.e. two directories contain the same class, the first one that can be found is used. In a nutshell, your java call of the GUIChooser could look like this: java -classpath \"/development:/distribution/weka.jar\" weka.gui.GUIChooser Note: Windows users have to replace the \":\" with \";\" and the forward slashes with backslashes. Multiple Class Hierarchies # In case you're developing your own framework, but still want to use your classifiers within WEKA that wasn't possible so far. With the release 3.4.4 it is possible to have multiple class hierarchies being displayed in the GUI. If you've developed a modified version of J48, let's call it MyJ48 and it's located in the package dummy.classifiers then you'll have to add this package to the classifiers list in the GPC file like this: weka.classifiers.Classifier = \\ weka.classifiers.bayes, \\ weka.classifiers.functions, \\ weka.classifiers.lazy, \\ weka.classifiers.meta, \\ weka.classifiers.trees, \\ weka.classifiers.rules, \\ dummy.classifiers Your java call for the GUIChooser might look like this: java -classpath \"weka.jar:dummy.jar\" weka.gui.GUIChooser Note: Windows users have to replace the \":\" with \";\" and the forward slashes with backslashes. Starting up the GUI you'll now have another root node in the tree view of the classifiers, called root , and below it the weka and the dummy package hierarchy as you can see here: Capabilities # Version 3.5.3 of Weka introduces the notion of Capabilities . Capabilities basically list what kind of data a certain object can handle, e.g., one classifier can handle numeric classes, but another cannot. In case a class supports capabilities the additional buttons Filter... and Remove filter will be available in the GOE. The Filter... button pops up a dialog which lists all available Capabilities: One can then choose those capabilities an object, e.g., a classifier, should have. If one is looking for classification problem, then the Nominal class Capability can be selected. 
On the other hand, if one needs a regression scheme, then the Capability Numeric class can be selected. This filtering mechanism makes the search for an appropriate learning scheme easier. After applying that filter, the tree with the objects will be displayed again and lists all objects that can handle all the selected Capabilities black , the ones that cannot red (starting with 3.5.8: silver ) and the ones that might be able to handle them blue (e.g., meta classifiers which depend on their base classifier(s)). Links # GenericObjectEditor (book version) CLASSPATH Properties file GenericPropertiesCreator.props GenericPropertiesCreator.excludes","title":"Introduction"},{"location":"generic_object_editor_developer_version/#introduction","text":"As of version 3.4.4 it is possible for WEKA to dynamically discover classes at runtime (rather than using only those specified in the GenericObjectEditor.props (GOE) file). In some versions (3.5.8, 3.6.0) this facility was not enabled by default as it is a bit slower than the GOE file approach, and, furthermore, does not function in environments that do not have a CLASSPATH (e.g., application servers). Later versions (3.6.1, 3.7.0) enabled the dynamic discovery again, as WEKA can now distinguish between being a standalone Java application or being run in a non-CLASSPATH environment. If you wish to enable or disable dynamic class discovery, the relevant file to edit is GenericPropertiesCreator.props (GPC). You can obtain this file either from the weka.jar or weka-src.jar archive. Open one of these files with an archive manager that can handle ZIP files (for Windows users, you can use 7-Zip for this) and navigate to the weka/gui directory, where the GPC file is located. All that is required, is to change the UseDynamic property in this file from false to true (for enabling it) or the other way round (for disabling it). After changing the file, you just place it in your home directory. In order to find out the location of your home directory, do the following: Linux/Unix Open a terminal run the following command: echo $HOME Windows Open a command-primpt run the following command: echo %USERPROFILE% If dynamic class discovery is too slow, e.g., due to an enormous CLASSPATH, you can generate a new GenericObjectEditor.props file and then turn dynamic class discovery off again. It is assumed that you already place the GPC file in your home directory (see steps above) and that the weka.jar jar archive with the WEKA classes is in your CLASSPATH (otherwise you have to add it to the java call using the -classpath option). For generating the GOE file, execute the following steps: generate a new GenericObjectEditor.props file using the following command: Linux/Unix java weka.gui.GenericPropertiesCreator \\ $HOME/GenericPropertiesCreator.props \\ $HOME/GenericObjectEditor.props Windows (command must be in one line) java weka.gui.GenericPropertiesCreator %USERPROFILE%\\GenericPropertiesCreator.props %USERPROFILE%\\GenericObjectEditor.props edit the GenericPropertiesCreator.props file in your home directory and set UseDynamic to false . A limitation of the GOE prior to 3.4.4 was, that additional classifiers, filters, etc., had to fit into the same package structure as the already existing ones, i.e., all had to be located below weka . 
WEKA can now display multiple class hierarchies in the GUI, which makes adding new functionality quite easy as we will see later in an example (it is not restricted to classifiers only, but also works with all the other entries in the GPC file).","title":"Introduction"},{"location":"generic_object_editor_developer_version/#file-structure","text":"The structure of the GOE so far was a key-value-pair, separated by an equals -sign. The value is a comma separated list of classes that are all derived from the superclass/superinterface key . The GPC is slightly different, instead of declaring all the classes/interfaces one need only to specify all the packages descendants are located in (only non-abstract ones are then listed). E.g., the weka.classifiers.Classifier entry in the GOE file looks like this: weka.classifiers.Classifier = \\ weka.classifiers.bayes.AODE, \\ weka.classifiers.bayes.BayesNet, \\ weka.classifiers.bayes.ComplementNaiveBayes, \\ weka.classifiers.bayes.NaiveBayes, \\ weka.classifiers.bayes.NaiveBayesMultinomial, \\ weka.classifiers.bayes.NaiveBayesSimple, \\ weka.classifiers.bayes.NaiveBayesUpdateable, \\ weka.classifiers.functions.LeastMedSq, \\ weka.classifiers.functions.LinearRegression, \\ weka.classifiers.functions.Logistic, \\ ... The entry producing the same output for the classifiers in the GPC looks like this (7 lines instead of over 70 in WEKA 3.4.4!): weka.classifiers.Classifier = \\ weka.classifiers.bayes, \\ weka.classifiers.functions, \\ weka.classifiers.lazy, \\ weka.classifiers.meta, \\ weka.classifiers.trees, \\ weka.classifiers.rules","title":"File Structure"},{"location":"generic_object_editor_developer_version/#exclusion","text":"It may not always be desired to list all the classes that can be found along the CLASSPATH . Sometimes, classes cannot be declared abstract but still shouldn't be listed in the GOE. For that reason one can list classes, interfaces, superclasses for certain packages to be excluded from display. This exclusion is done with the following file: weka/gui/GenericPropertiesCreator.excludes The format of this properties file is fairly easy: =:[,:] Where the corresponds to a key in the GenericPropertiesCreator.props file and the can be one of the following: S - Superclass any class class derived from this will be excluded I - Interface any class implementing this interface will be excluded C - Class exactly this class will be excluded Here are a few examples: # exclude all ResultListeners that also implement the ResultProducer interface # (all ResultProducers do that!) weka.experiment.ResultListener = \\ I:weka.experiment.ResultProducer # exclude J48 and all SingleClassifierEnhancers weka.classifiers.Classifier = \\ C:weka.classifiers.trees.J48, \\ S:weka.classifiers.SingleClassifierEnhancer","title":"Exclusion"},{"location":"generic_object_editor_developer_version/#class-discovery","text":"Unlike the Class.forName(String) method that grabs the first class it can find in the CLASSPATH , and therefore fixes the location of the package it found the class in, the dynamic discovery examines the complete CLASSPATH you're starting the Java Virtual Machine (JVM) with. This means that you can have several parallel directories with the same WEKA package structure, e.g. the standard release of WEKA in one directory ( /distribution/weka.jar ) and another one with your own classes ( /development/weka/... ), and display all of the classifiers in the GUI. In case of a name conflict, i.e. 
two directories contain the same class, the first one that can be found is used. In a nutshell, your java call of the GUIChooser could look like this: java -classpath \"/development:/distribution/weka.jar\" weka.gui.GUIChooser Note: Windows users have to replace the \":\" with \";\" and the forward slashes with backslashes.","title":"Class Discovery"},{"location":"generic_object_editor_developer_version/#multiple-class-hierarchies","text":"In case you're developing your own framework but still want to use your classifiers within WEKA: that wasn't possible so far. With the release 3.4.4 it is possible to have multiple class hierarchies being displayed in the GUI. If you've developed a modified version of J48, let's call it MyJ48 , and it's located in the package dummy.classifiers , then you'll have to add this package to the classifiers list in the GPC file like this: weka.classifiers.Classifier = \\ weka.classifiers.bayes, \\ weka.classifiers.functions, \\ weka.classifiers.lazy, \\ weka.classifiers.meta, \\ weka.classifiers.trees, \\ weka.classifiers.rules, \\ dummy.classifiers Your java call for the GUIChooser might look like this: java -classpath \"weka.jar:dummy.jar\" weka.gui.GUIChooser Note: Windows users have to replace the \":\" with \";\" and the forward slashes with backslashes. Starting up the GUI you'll now have another root node in the tree view of the classifiers, called root , and below it the weka and the dummy package hierarchy as you can see here:","title":"Multiple Class Hierarchies"},{"location":"generic_object_editor_developer_version/#capabilities","text":"Version 3.5.3 of Weka introduces the notion of Capabilities . Capabilities basically list what kind of data a certain object can handle, e.g., one classifier can handle numeric classes, but another cannot. In case a class supports capabilities, the additional buttons Filter... and Remove filter will be available in the GOE. The Filter... button pops up a dialog which lists all available Capabilities: One can then choose those capabilities an object, e.g., a classifier, should have. If one is looking at a classification problem, then the Nominal class Capability can be selected. On the other hand, if one needs a regression scheme, then the Capability Numeric class can be selected. This filtering mechanism makes the search for an appropriate learning scheme easier. After applying that filter, the tree with the objects will be displayed again and lists all objects that can handle all the selected Capabilities in black , the ones that cannot in red (starting with 3.5.8: silver ) and the ones that might be able to handle them in blue (e.g., meta classifiers which depend on their base classifier(s)).","title":"Capabilities"},{"location":"generic_object_editor_developer_version/#links","text":"GenericObjectEditor (book version) CLASSPATH Properties file GenericPropertiesCreator.props GenericPropertiesCreator.excludes","title":"Links"},{"location":"get_latest_bugfixes/","text":"Weka is actively developed, which means that bugs are fixed and new functionality is added (only to the developer version) all the time. Every now and then (about every 6-12 months), when there has been a sufficiently large number of improvements or fixes, a release is made and uploaded to Sourceforge.net . If you don't want to wait that long, you can get the latest source code from Git and compile it yourself.
See the following articles for more information: obtaining the source code from Git , either book or developer version compiling the source code","title":"Get latest bugfixes"},{"location":"getting_help/","text":"In addition to consulting the available documentation , try searching a mailing list archive or community forum to check whether a solution to your problem has already been posted there. Please consult these sources of information before posting a query on the Weka mailing list or elsewhere. And please never email individual Weka developers directly. When you do post a message regarding a problem you encountered with Weka, please include as much information as possible. In particular, consider running Weka with a console window open so that you can see the entire error output from Java (including the Java stack trace). This makes it much more likely that you will get useful help. When posting questions, comments, or bug reports to the Weka mailing list, consider the mailing list etiquette . Mailing list archive and mirrors # Consider searching the archive of the Weka mailing list (wekalist) or its mirror marc.info . Forums offering help # You should also consider looking for a solution at stackoverflow.com , the old forum for Weka at pentaho.com , or the newer forum at hitachivantara.com . Bug reports # Bug reports can be sent to the Weka mailing list or posted at the JIRA . IRC channel for discussing Weka # ##weka on freenode.","title":"Getting help"},{"location":"getting_help/#mailing-list-archive-and-mirrors","text":"Consider searching the archive of the Weka mailing list (wekalist) or its mirror marc.info .","title":"Mailing list archive and mirrors"},{"location":"getting_help/#forums-offering-help","text":"You should also consider looking for a solution at stackoverflow.com , the old forum for Weka at pentaho.com , or the newer forum at hitachivantara.com .","title":"Forums offering help"},{"location":"getting_help/#bug-reports","text":"Bug reports can be sent to the Weka mailing list or posted at the JIRA .","title":"Bug reports"},{"location":"getting_help/#irc-channel-for-discussing-weka","text":"##weka on freenode.","title":"IRC channel for discussing Weka"},{"location":"git/","text":"General # The main trunk of the Weka Git repository is accessible and browseable via the following URL: https://git.cms.waikato.ac.nz/weka/weka/-/tree/main/trunk Other branches can be accessed via https://git.cms.waikato.ac.nz/weka/weka For example, if you want to obtain the source code of the 3.8 version, use this URL: https://git.cms.waikato.ac.nz/weka/weka/-/tree/stable-3-8 Specific version # Whenever a release of Weka is generated, the repository gets tagged . The tag for a development version has the form dev-X-Y-Z For example, WEKA 3.9.6 corresponds to the tag dev-3-9-6. The tag for a stable version is stable-X-Y-Z The WEKA 3.8 version is one of those stable versions, e.g., stable-3-8-6 will be the tag for Weka 3.8.6.","title":"General"},{"location":"git/#general","text":"The main trunk of the Weka Git repository is accessible and browseable via the following URL: https://git.cms.waikato.ac.nz/weka/weka/-/tree/main/trunk Other branches can be accessed via https://git.cms.waikato.ac.nz/weka/weka For example, if you want to obtain the source code of the 3.8 version, use this URL: https://git.cms.waikato.ac.nz/weka/weka/-/tree/stable-3-8","title":"General"},{"location":"git/#specific-version","text":"Whenever a release of Weka is generated, the repository gets tagged .
The tag for a development version has the form dev-X-Y-Z For example, WEKA 3.9.6 corresponds to the tag dev-3-9-6. The tag for a stable version is stable-X-Y-Z The WEKA 3.8 version is one of those stable versions, e.g., stable-3-8-6 will be the tag for Weka 3.8.6.","title":"Specific version"},{"location":"gpgpu/","text":"See: this post I am looking for input from WEKA users. Please leave a comment on the website and I'll respond. The input/help I need from WEKA users is as follows: I need to know which algorithms would be most desirable to have optimized first. For now, I'm working on Bayes (for starters). I need willing volunteers to use the revised code that I create (I am not making any changes to the algorithms, just diverting the mathematical calculations from the CPU to the GPU to increase speed) and let me know of any performance changes observed.","title":"Gpgpu"},{"location":"gui_chooser_starts_but_not_experimenter_or_explorer/","text":"The GUIChooser starts, but the Explorer and Experimenter do not start and output an Exception like this in the terminal: /usr/share/themes/Mist/gtk-2.0/gtkrc:48: Engine \"mist\" is unsupported, ignoring ---Registering Weka Editors--- java.lang.NullPointerException at weka.gui.explorer.PreprocessPanel.addPropertyChangeListener(PreprocessPanel.java:519) at javax.swing.plaf.synth.SynthPanelUI.installListeners(SynthPanelUI.java:49) at javax.swing.plaf.synth.SynthPanelUI.installUI(SynthPanelUI.java:38) at javax.swing.JComponent.setUI(JComponent.java:652) at javax.swing.JPanel.setUI(JPanel.java:131) ... This behavior happens only under Java 5/6 and Gnome/Linux; KDE doesn't produce this error. The reason for this is that Weka tries to look more \"native\" and therefore sets a platform-specific Swing theme. Unfortunately, this doesn't seem to be working correctly in Java 5/6 together with Gnome. A workaround for this is to set the cross-platform Metal theme. In order to use another theme, one only has to create the following properties file: LookAndFeel.props with this content: Theme = javax.swing.plaf.metal.MetalLookAndFeel","title":"Gui chooser starts but not experimenter or explorer"},{"location":"history/","text":"Book 1st ed. version (3.0) Old GUI version (3.2) Stable/Book 2nd ed. version (3.4) Stable/Book 3rd ed. version (3.6) Stable/Book 4th ed. version (3.8) Development version (3.9) 3.8.6 (pkgs) 3.9.6 (pkgs) 3.8.5 (pkgs) 3.9.5 (pkgs) 3.8.4 (pkgs) 3.9.4 (pkgs) 3.8.3 (pkgs) 3.9.3 (pkgs) 3.8.2 (pkgs) 3.9.2 (pkgs) 3.6.15 3.8.1 (pkgs) 3.9.1 (pkgs) 3.6.14 3.8.0 (pkgs) 3.9.0 (pkgs) 3.6.13 3.7.13 (pkgs) 3.6.12 3.7.12 (pkgs) 3.6.11 3.7.11 (pkgs) 3.6.10 3.7.10 (pkgs) 3.7.9 (pkgs) 3.6.9 3.7.8 (pkgs) 3.6.8 3.7.7 (pkgs) 3.6.7 3.7.6 (pkgs) 3.6.6 3.7.5 (pkgs) 3.4.19 3.6.5 3.7.4 (pkgs) 3.4.18 3.6.4 3.7.3 (pkgs) 3.4.17 3.6.3 3.7.2 (pkgs) 3.4.16 3.6.2 3.7.1 3.4.15 3.6.1 3.7.0 3.4.14 3.6.0 3.4.13 3.5.8 3.4.12 3.5.7 3.4.11 3.5.6 3.4.10 3.5.5 3.4.9 3.5.4 3.4.8 3.5.3 3.4.7 3.5.2 3.4.6 3.5.1 3.4.5 3.5.0 3.4.4 3.4.3 3.4.2 3.4.1 3.4 3.3.6 3.3.5 3.3.4 3.3.3 3.2.3 3.3.2 3.0.6 3.2.2 3.3.1 3.0.5 3.2.1 3.3 3.0.4 3.2 3.0.3 3.1.9 3.0.2 3.1.8 3.0.1 3.1.7 3.0 3.1.6 Prerelease 6 3.1.5 Prerelease 5 3.1.4 Prerelease 4","title":"History"},{"location":"how_do_i_modify_the_classpath/","text":"See the article CLASSPATH and check out this section for changing the environment variable. This article explains how to add a MySQL jar to the variable.
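As a quick sketch of the environment-variable route (the jar name and paths below are placeholders, not actual file names): on Linux/Mac you can extend the variable for the current shell session with
export CLASSPATH=/path/to/mysql-connector-java.jar:$CLASSPATH
while on Windows the equivalent for the current command prompt is
set CLASSPATH=C:\\path\\to\\mysql-connector-java.jar;%CLASSPATH%
Making the change permanent is covered in the CLASSPATH article.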
With version 3.5.4 or later you can also just use the RunWeka.ini file to modify your CLASSPATH.","title":"How do i modify the classpath"},{"location":"how_do_i_use_the_associator_generalized_sequential_patterns/","text":"The article GeneralizedSequentialPatterns contains more information on this associator.","title":"How do i use the associator generalized sequential patterns"},{"location":"how_to_run_weka_schemes_from_commandline/","text":"It is quite often the case that one has to run a classifier, filter, attribute selection, etc. from the commandline, leaving the comfort of the GUI (most likely the Explorer). Due to the vast amount of options the Weka schemes offer, it can be quite tedious setting up a scheme on the commandline. In the following, a few different approaches are listed that can be used for running a scheme from the commandline: Hardcore approach (works for all versions of Weka): one just uses the -h option to display the commandline help with all available options and chooses the ones that apply, e.g.: java weka.classifiers.functions.SMO -h The drawback of this method is that one has to take care of escaping nested quotes oneself. As soon as one has to use meta-classifiers, this gets really messy. An introduction to the commandline use can be found in the Primer . Copy/paste approach: with this approach, one doesn't have to worry about correct nesting, since Weka takes care of that, returning correctly nested and escaped options. Since version 3.5.3, one can right-click (or Ctrl + left-click for Mac users) any GenericObjectEditor panel and select the Copy configuration to clipboard option to copy the currently shown configuration to the clipboard and then just paste it into the commandline. One only needs to add the appropriate java call and other general options, like datasets, class index, etc. Another copy/paste approach is copying the configurations from the Explorer log, which is available since version 3.5.4. Every action in the Explorer, like applying a filter, running a classifier, attribute selection, etc., outputs the command to the log as well. This makes it fairly easy to copy it to the clipboard and use it in the console; only the java call and other general options need to be added. See also # Primer - introduction to Weka from the commandline CLASSPATH - how to load all necessary libraries or welcome to the JAR hell Command redirection - shows how to redirect output in files","title":"How to run weka schemes from commandline"},{"location":"how_to_run_weka_schemes_from_commandline/#see-also","text":"Primer - introduction to Weka from the commandline CLASSPATH - how to load all necessary libraries or welcome to the JAR hell Command redirection - shows how to redirect output in files","title":"See also"},{"location":"ikvm_with_weka_tutorial/","text":"This tutorial walks you through the creation of a Microsoft C# program that uses Weka , and some Java API classes, via IKVM . The process will be similar for other .NET languages. Set up / Installation # You will first need to install IKVM, which can be found here . You will also need a C# compiler/VM - Mono is an excellent open source solution for both Linux and Windows, or you could just use Microsoft Visual Studio .NET. Conversion from Java to a .NET dll # With that out of the way, the first thing you will want to do is to convert the Weka .jar file into a .NET dll. To do this, we will use ikvmc , which is the IKVM static compiler.
On the console, go to the directory which contains weka.jar, and type: > ikvmc -target:library weka.jar The -target:library option causes ikvmc to create a .dll library instead of an executable. Note that the IKVM tutorial tells you that you should add -reference:/usr/lib/IKVM.GNU.Classpath.dll (or the appropriate path) to the above command; this tells IKVM where to find the GNU Classpath library. However, IKVM.GNU.Classpath.dll is no longer included in the download package and stems from very old versions of IKVM; when Sun open-sourced Java, it was replaced by the IKVM.OpenJDK.*.dll files. You should now have a file called \"weka.dll\", which is a .NET version of the entire weka API. That's exactly what we want! Use the dll in a .NET application # To try it out, let's use a small C# program that I wrote. The program simply runs the J48 classifier on the Iris dataset with a 66% train/test split, and prints out the correctness percentage. It also uses a few Java classes, and is already about 95% legal Java code. The code is here: //start of file Main.cs using System; class MainClass { public static void Main(string[] args) { Console.WriteLine(\"Hello Java, from C#!\"); classifyTest(); } const int percentSplit = 66; public static void classifyTest() { try { weka.core.Instances insts = new weka.core.Instances(new java.io.FileReader(\"iris.arff\")); insts.setClassIndex(insts.numAttributes() - 1); weka.classifiers.Classifier cl = new weka.classifiers.trees.J48(); Console.WriteLine(\"Performing \" + percentSplit + \"% split evaluation.\"); //randomize the order of the instances in the dataset weka.filters.Filter myRandom = new weka.filters.unsupervised.instance.Randomize(); myRandom.setInputFormat(insts); insts = weka.filters.Filter.useFilter(insts, myRandom); int trainSize = insts.numInstances() * percentSplit / 100; int testSize = insts.numInstances() - trainSize; weka.core.Instances train = new weka.core.Instances(insts, 0, trainSize); cl.buildClassifier(train); int numCorrect = 0; for (int i = trainSize; i < insts.numInstances(); i++) { weka.core.Instance currentInst = insts.instance(i); double predictedClass = cl.classifyInstance(currentInst); if (predictedClass == insts.instance(i).classValue()) numCorrect++; } Console.WriteLine(numCorrect + \" out of \" + testSize + \" correct (\" + (double)((double)numCorrect / (double)testSize * 100.0) + \"%)\"); } catch (java.lang.Exception ex) { ex.printStackTrace(); } } } //end of file Main.cs Compile and run it # Now we just need to compile it. If you are using MonoDevelop or Visual Studio, you will need to add references to weka.dll, and all of the IKVM.OpenJDK.*.dll files, and lastly IKVM.Runtime.dll into your project. Otherwise, on the command line, you can type: NOTE: replace IKVM.OpenJDK.*.dll with the remaining IKVM.OpenJDK files. >mcs Main.cs -r:weka.dll,IKVM.Runtime.dll,IKVM.OpenJDK.core.dll,IKVM.OpenJDK.*.dll to run the Mono C# compiler with references to the appropriate dlls (according to the Mono documentation, the command line arguments for Visual Studio are the same). And there you go! Now you can run the program. But make sure that the iris.arff dataset is in the same directory first. For Mono: >mono Main.exe or if you are using Visual Studio, just: >Main.exe Hopefully you will get as output: Hello Java, from C#! Performing 66% split evaluation.
49 out of 51 correct (96.078431372549%) And there you have it. Now we have a working C# program for the .NET framework that uses Weka classes and some classes from the standard Java API. Links # An Introduction to IKVM IKVM.NET Mono The official IKVM tutorial Use Weka with the Microsoft .NET Framework","title":"Ikvm with weka tutorial"},{"location":"ikvm_with_weka_tutorial/#set-up-installation","text":"You will first need to install IKVM, which can be found here . You will also need a C# compiler/VM - Mono is an excellent open source solution for both Linux and Windows, or you could just use Microsoft Visual Studio .NET.","title":"Set up / Installation"},{"location":"ikvm_with_weka_tutorial/#conversion-from-java-to-a-net-dll","text":"With that out of the way, the first thing you will want to do is to convert the Weka .jar file into a .NET dll. To do this, we will use ikvmc , which is the IKVM static compiler. On the console, go to the directory which contains weka.jar, and type: > ikvmc -target:library weka.jar The -target:library option causes ikvmc to create a .dll library instead of an executable. Note that the IKVM tutorial tells you that you should add -reference:/usr/lib/IKVM.GNU.Classpath.dll (or the appropriate path) to the above command; this tells IKVM where to find the GNU Classpath library. However, IKVM.GNU.Classpath.dll is no longer included in the download package and stems from very old versions of IKVM; when Sun open-sourced Java, it was replaced by the IKVM.OpenJDK.*.dll files. You should now have a file called \"weka.dll\", which is a .NET version of the entire weka API. That's exactly what we want!","title":"Conversion from Java to a .NET dll"},{"location":"ikvm_with_weka_tutorial/#use-the-dll-in-a-net-application","text":"To try it out, let's use a small C# program that I wrote. The program simply runs the J48 classifier on the Iris dataset with a 66% train/test split, and prints out the correctness percentage. It also uses a few Java classes, and is already about 95% legal Java code. The code is here: //start of file Main.cs using System; class MainClass { public static void Main(string[] args) { Console.WriteLine(\"Hello Java, from C#!\"); classifyTest(); } const int percentSplit = 66; public static void classifyTest() { try { weka.core.Instances insts = new weka.core.Instances(new java.io.FileReader(\"iris.arff\")); insts.setClassIndex(insts.numAttributes() - 1); weka.classifiers.Classifier cl = new weka.classifiers.trees.J48(); Console.WriteLine(\"Performing \" + percentSplit + \"% split evaluation.\"); //randomize the order of the instances in the dataset weka.filters.Filter myRandom = new weka.filters.unsupervised.instance.Randomize(); myRandom.setInputFormat(insts); insts = weka.filters.Filter.useFilter(insts, myRandom); int trainSize = insts.numInstances() * percentSplit / 100; int testSize = insts.numInstances() - trainSize; weka.core.Instances train = new weka.core.Instances(insts, 0, trainSize); cl.buildClassifier(train); int numCorrect = 0; for (int i = trainSize; i < insts.numInstances(); i++) { weka.core.Instance currentInst = insts.instance(i); double predictedClass = cl.classifyInstance(currentInst); if (predictedClass == insts.instance(i).classValue()) numCorrect++; }
Console.WriteLine(numCorrect + \" out of \" + testSize + \" correct (\" + (double)((double)numCorrect / (double)testSize * 100.0) + \"%)\"); } catch (java.lang.Exception ex) { ex.printStackTrace(); } } } //end of file Main.cs","title":"Use the dll in a .NET application"},{"location":"ikvm_with_weka_tutorial/#compile-and-run-it","text":"Now we just need to compile it. If you are using MonoDevelop or Visual Studio, you will need to add references to weka.dll, and all of the IKVM.OpenJDK.*.dll files, and lastly IKVM.Runtime.dll into your project. Otherwise, on the command line, you can type: NOTE: replace IKVM.OpenJDK.*.dll with the remaining IKVM.OpenJDK files. >mcs Main.cs -r:weka.dll,IKVM.Runtime.dll,IKVM.OpenJDK.core.dll,IKVM.OpenJDK.*.dll to run the Mono C# compiler with references to the appropriate dlls (according to the Mono documentation, the command line arguments for Visual Studio are the same). And there you go! Now you can run the program. But make sure that the iris.arff dataset is in the same directory first. For Mono: >mono Main.exe or if you are using Visual Studio, just: >Main.exe Hopefully you will get as output: Hello Java, from C#! Performing 66% split evaluation. 49 out of 51 correct (96.078431372549%) And there you have it. Now we have a working C# program for the .NET framework that uses Weka classes and some classes from the standard Java API.","title":"Compile and run it"},{"location":"ikvm_with_weka_tutorial/#links","text":"An Introduction to IKVM IKVM.NET Mono The official IKVM tutorial Use Weka with the Microsoft .NET Framework","title":"Links"},{"location":"instance_id/","text":"People often want to tag their instances with identifiers , so they can keep track of them and the predictions made on them. Adding the ID # A new ID attribute is easily added: one only needs to run the AddID filter over the dataset and it's done. Here's an example (at a DOS/Unix command prompt): java weka.filters.unsupervised.attribute.AddID -i data_without_id.arff -o data_with_id.arff (all on a single line) Note: the AddID filter adds a numeric attribute, not a String attribute, to the dataset. If you want to remove this ID attribute again for the classifier in a FilteredClassifier environment, use the Remove filter instead of the RemoveType filter (same package). Removing the ID # If you run from the command line you can use the -p option to output predictions plus any other attributes you are interested in. So it is possible to have a string attribute in your data that acts as an identifier. A problem is that most classifiers don't like String attributes, but you can get around this by using the RemoveType filter (it removes String attributes by default). Here's an example. Let's say you have a training file named train.arff , a testing file named test.arff , and they have an identifier String attribute as their 5th attribute. You can get the predictions from J48 along with the identifier strings by issuing the following command (at a DOS/Unix command prompt): java weka.classifiers.meta.FilteredClassifier -F weka.filters.unsupervised.attribute.RemoveType -W weka.classifiers.trees.J48 -t train.arff -T test.arff -p 5 (all on a single line) If you want, you can redirect the output to a file by adding \" > output.txt \" to the end of the line. In the Explorer GUI you could try a similar trick of using the String attribute identifiers here as well. Choose the FilteredClassifier , with the RemoveType as the filter, and whatever classifier you prefer.
When you visualize the results, you will need to click through each instance to see the identifier listed for each.","title":"Instance id"},{"location":"instance_id/#adding-the-id","text":"A new ID attribute is easily added: one only needs to run the AddID filter over the dataset and it's done. Here's an example (at a DOS/Unix command prompt): java weka.filters.unsupervised.attribute.AddID -i data_without_id.arff -o data_with_id.arff (all on a single line) Note: the AddID filter adds a numeric attribute, not a String attribute, to the dataset. If you want to remove this ID attribute again for the classifier in a FilteredClassifier environment, use the Remove filter instead of the RemoveType filter (same package).","title":"Adding the ID"},{"location":"instance_id/#removing-the-id","text":"If you run from the command line you can use the -p option to output predictions plus any other attributes you are interested in. So it is possible to have a string attribute in your data that acts as an identifier. A problem is that most classifiers don't like String attributes, but you can get around this by using the RemoveType filter (it removes String attributes by default). Here's an example. Let's say you have a training file named train.arff , a testing file named test.arff , and they have an identifier String attribute as their 5th attribute. You can get the predictions from J48 along with the identifier strings by issuing the following command (at a DOS/Unix command prompt): java weka.classifiers.meta.FilteredClassifier -F weka.filters.unsupervised.attribute.RemoveType -W weka.classifiers.trees.J48 -t train.arff -T test.arff -p 5 (all on a single line) If you want, you can redirect the output to a file by adding \" > output.txt \" to the end of the line. In the Explorer GUI you could try a similar trick of using the String attribute identifiers here as well. Choose the FilteredClassifier , with the RemoveType as the filter, and whatever classifier you prefer. When you visualize the results, you will need to click through each instance to see the identifier listed for each.","title":"Removing the ID"},{"location":"j48_weighter_patch/","text":"Description # J48-Weighter patch: Modification of J48 for Weighted Data. Reference # -none- Package # Patches to: weka.classifiers.trees.j48 weka.core weka.filters.unsupervised.attribute Download # Patch for Weka 3.4.5: j48-weighter.patch Additional Information # This patch addresses two separate but related issues: The proposed filter \"Weighter\" allows one to specify a numeric attribute to be used as an instance weight. As mentioned on Wekalist, tests using weighted sample-survey data indicated possible problems in the J48 decision tree algorithm. The Weighter filter # Weighter is a general-purpose filter independent of J48 or other classifiers, but to preserve the weight assignment it initially had to be run under FilteredClassifier. To make weights persistent via .arff files, some changes were made in Instances and Instance, while retaining compatibility with the existing ARFF format. Briefly, if Weighter is applied to an attribute, e.g. \"fnlwgt\" in the \"adult\" dataset from the UCI repository, that attribute is removed and its value is used as instance weight. Upon Save, the weight is appended to each instance under the attribute name \"::weight::fnlwgt\"; reading the .arff file inverts the Save process, transparently to the user. Repeated application of Weighter multiplies the weight and extends its name.
The special case of invoking Weighter without an attribute argument restores the unweighted dataset, with an appended attribute named as above. J48 with instance weights # The simple rescaling inserted in weka.classifiers.trees.j48.Stats is intended to: use the correct sample size in the normal approximation to the binomial, make the scale of the .5 continuity correction consistent with the data, base the minimum-leaf-count option (-M) on unweighted counts. These changes make pruning more effective with weighted data, and help to reduce apparent overfitting. This should be the case whether the weights reflect missing value imputation (as is common in Weka), or survey-sampling probabilities (e.g. \"fnlwgt\" in the UCI \"adult\" sample). The modification to j48.Stats would not have worked on its own. In particular, j48.Distribution had been written to maintain one set of counts only. To work on weighted data, statistical algorithms often require both weighted and unweighted counts. A few other minor modifications were introduced to change the way \"-M\" works. One effect is that, for this purpose, instances with missing x-values are no longer counted; they are considered missing.","title":"Description"},{"location":"j48_weighter_patch/#description","text":"J48-Weighter patch: Modification of J48 for Weighted Data.","title":"Description"},{"location":"j48_weighter_patch/#reference","text":"-none-","title":"Reference"},{"location":"j48_weighter_patch/#package","text":"Patches to: weka.classifiers.trees.j48 weka.core weka.filters.unsupervised.attribute","title":"Package"},{"location":"j48_weighter_patch/#download","text":"Patch for Weka 3.4.5: j48-weighter.patch","title":"Download"},{"location":"j48_weighter_patch/#additional-information","text":"This patch addresses two separate but related issues: The proposed filter \"Weighter\" allows one to specify a numeric attribute to be used as an instance weight. As mentioned on Wekalist, tests using weighted sample-survey data indicated possible problems in the J48 decision tree algorithm.","title":"Additional Information"},{"location":"j48_weighter_patch/#the-weighter-filter","text":"Weighter is a general-purpose filter independent of J48 or other classifiers, but to preserve the weight assignment it initially had to be run under FilteredClassifier. To make weights persistent via .arff files, some changes were made in Instances and Instance, while retaining compatibility with the existing ARFF format. Briefly, if Weighter is applied to an attribute, e.g. \"fnlwgt\" in the \"adult\" dataset from the UCI repository, that attribute is removed and its value is used as instance weight. Upon Save, the weight is appended to each instance under the attribute name \"::weight::fnlwgt\"; reading the .arff file inverts the Save process, transparently to the user. Repeated application of Weighter multiplies the weight and extends its name. The special case of invoking Weighter without an attribute argument restores the unweighted dataset, with an appended attribute named as above.","title":"The Weighter filter"},{"location":"j48_weighter_patch/#j48-with-instance-weights","text":"The simple rescaling inserted in weka.classifiers.trees.j48.Stats is intended to: use the correct sample size in the normal approximation to the binomial, make the scale of the .5 continuity correction consistent with the data, base the minimum-leaf-count option (-M) on unweighted counts. These changes make pruning more effective with weighted data, and help to reduce apparent overfitting.
This should be the case whether the weights reflect missing value imputation (as is common in Weka), or survey-sampling probabilities (e.g. \"fnlwgt\" in the UCI \"adult\" sample). The modification to j48.Stats would not have worked on its own. In particular, j48.Distribution had been written to maintain one set of counts only. To work on weighted data, statistical algorithms often require both weighted and unweighted counts. A few other minor modifications were introduced to change the way \"-M\" works. One effect is that, for this purpose, instances with missing x-values are no longer counted; they are considered missing.","title":"J48 with instance weights"},{"location":"java_virtual_machine/","text":"The Java virtual machine (JVM) is the platform-dependent interpreter of the Java bytecode (i.e., the classes ). It translates the bytecode into machine-specific instructions. Amount of available memory # If you start the virtual machine without any parameters, it takes default values for stack and heap. In case you run into OutOfMemory exceptions, try to start your JVM with a bigger maximum heap size. (However, there's a limit, depending on your OS. See the 32-Bit and 64-Bit sections.) 32-bit # With a 32-Bit machine you can address at most 4GB of virtual memory . Different operating systems divide up the memory further into system/kernel and user space. From experience, you can achieve the following maximum sizes for the heap on Windows and Linux: Windows: 1.4GB Linux: 1.7GB 64-bit # Larger heap sizes are available when using 64-bit Java in conjunction with a 64-bit operating system. There is more information available here .","title":"Java virtual machine"},{"location":"java_virtual_machine/#amount-of-available-memory","text":"If you start the virtual machine without any parameters, it takes default values for stack and heap. In case you run into OutOfMemory exceptions, try to start your JVM with a bigger maximum heap size. (However, there's a limit, depending on your OS. See the 32-Bit and 64-Bit sections.)","title":"Amount of available memory"},{"location":"java_virtual_machine/#32-bit","text":"With a 32-Bit machine you can address at most 4GB of virtual memory . Different operating systems divide up the memory further into system/kernel and user space. From experience, you can achieve the following maximum sizes for the heap on Windows and Linux: Windows: 1.4GB Linux: 1.7GB","title":"32-bit"},{"location":"java_virtual_machine/#64-bit","text":"Larger heap sizes are available when using 64-bit Java in conjunction with a 64-bit operating system. There is more information available here .","title":"64-bit"},{"location":"jupyter_notebooks/","text":"Jupyter notebooks are extremely popular in the Python world, simply because it is great to combine documentation and code in a visually appealing way. Great tool for teaching! Thanks to the IJava kernel and the JDK 9+ JShell feature, it is now possible to run Java within notebooks as well, without compiling the code.
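To give an idea of what such a notebook looks like in practice, a cell can contain plain Java that uses the Weka API directly. A minimal sketch (assumptions: weka.jar has been made available to the kernel, e.g., via IJava's %jars magic, and iris.arff sits next to the notebook):
%jars weka.jar
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
// load a dataset and declare the last attribute to be the class
Instances data = DataSource.read(\"iris.arff\");
data.setClassIndex(data.numAttributes() - 1);
System.out.println(data.numInstances() + \" instances loaded.\");
The platform-specific steps below set up the kernel itself.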
Installation on Linux # The following worked on Linux Mint 18.2: create a directory called weka-notebooks mkdir weka-notebooks change into the directory and create a Python virtual environment: cd weka-notebooks virtualenv -p /usr/bin/python3.5 venv install Jupyter notebooks and its dependencies: venv/bin/pip install jupyter then download the latest IJava release (at the time of writing, this was 1.2.0 ) into this directory unzip the IJava archive: unzip -q ijava*.zip install the Java kernel into the virtual environment, using the IJava installer: venv/bin/python install.py --sys-prefix after that, fire up Jupyter using: venv/bin/jupyter-notebook now you can create new (Java) notebooks! Installation on Windows (using anaconda) # open a command prompt create a new environment using anaconda (e.g., for Python 3.5) conda create -n py35-ijava python=3.5 activate environment activate py35-ijava install Jupyter pip install jupyter download the latest IJava release (at the time of writing, this was 1.2.0 ) unzip the IJava release (e.g., with your File browser or 7-Zip) change into the directory where you extracted the release, containing the install.py , e.g.: cd C:\\Users\\fracpete\\Downloads\\ijava-1.2.0 install the kernel python install.py --sys-prefix start Jupyter jupyter-notebook now you can create new (Java) notebooks!","title":"Jupyter notebooks"},{"location":"jupyter_notebooks/#installation-on-linux","text":"The following worked on Linux Mint 18.2: create a directory called weka-notebooks mkdir weka-notebooks change into the directory and create a Python virtual environment: cd weka-notebooks virtualenv -p /usr/bin/python3.5 venv install Jupyter notebooks and its dependencies: venv/bin/pip install jupyter then download the latest IJava release (at the time of writing, this was 1.2.0 ) into this directory unzip the IJava archive: unzip -q ijava*.zip install the Java kernel into the virtual environment, using the IJava installer: venv/bin/python install.py --sys-prefix after that, fire up Jupyter using: venv/bin/jupyter-notebook now you can create new (Java) notebooks!","title":"Installation on Linux"},{"location":"jupyter_notebooks/#installation-on-windows-using-anaconda","text":"open a command prompt create a new environment using anaconda (e.g., for Python 3.5) conda create -n py35-ijava python=3.5 activate environment activate py35-ijava install Jupyter pip install jupyter download the latest IJava release (at the time of writing, this was 1.2.0 ) unzip the IJava release (e.g., with your File browser or 7-Zip) change into the directory where you extracted the release, containing the install.py , e.g.: cd C:\\Users\\fracpete\\Downloads\\ijava-1.2.0 install the kernel python install.py --sys-prefix start Jupyter jupyter-notebook now you can create new (Java) notebooks!","title":"Installation on Windows (using anaconda)"},{"location":"just_in_time_jit_compiler/","text":"For maximum enjoyment, use a virtual machine that incorporates a just-in-time compiler . This can speed things up quite significantly. Note also that there can be large differences in execution time between different virtual machines.
The Sun JDK/JRE includes a JIT compiler (\"HotSpot\").","title":"Just in time jit compiler"},{"location":"jvm/","text":"see Java Virtual Machine","title":"Jvm"},{"location":"knowledge_flow_toolbars_are_empty/","text":"In the terminal, you will most likely see this output as well: Failed to instantiate: weka.gui.beans.Loader This behavior can happen under Gnome with Java 5/6; see GUIChooser starts but not Experimenter or Explorer for a solution.","title":"Knowledge flow toolbars are empty"},{"location":"learning_resources/","text":"Videos # Youtube channel of Data Mining with Weka MOOCs Tutorials # Learn Data Science Online MOOCs # Data Mining with Weka More Data Mining with Weka Advanced Data Mining with Weka","title":"Videos"},{"location":"learning_resources/#videos","text":"Youtube channel of Data Mining with Weka MOOCs","title":"Videos"},{"location":"learning_resources/#tutorials","text":"Learn Data Science Online","title":"Tutorials"},{"location":"learning_resources/#moocs","text":"Data Mining with Weka More Data Mining with Weka Advanced Data Mining with Weka","title":"MOOCs"},{"location":"lib_svm/","text":"Description # Wrapper class for the LibSVM library by Chih-Chung Chang and Chih-Jen Lin. The original wrapper, named WLSVM, was developed by Yasser EL-Manzalawy. The current version is a complete rewrite of the wrapper, using Reflection in order to avoid compilation errors in case the libsvm.jar is not in the CLASSPATH . Important note: From WEKA >= 3.7.2, installation and use of LibSVM in WEKA has been simplified by the creation of a LibSVM package that can be installed using either the graphical or command line package manager . Reference (Weka <= 3.6.8) # LibSVM WLSVM Package # weka.classifiers.functions Download # The wrapper class has been part of WEKA since version 3.5.2. But LibSVM , as a third-party tool, needs to be downloaded separately. It is recommended to upgrade to a post-3.5.3 version (or git ) for bug-fixes and extensions (it now contains the distributionForInstance method). CLASSPATH # Add the libsvm.jar from the LibSVM distribution to your CLASSPATH to make it available. Note: Do NOT start WEKA then with java -jar weka.jar . The -jar option overwrites the CLASSPATH ; it does not augment it (a very common trap to fall into). Instead use something like this on Linux: java -classpath $CLASSPATH :weka.jar:libsvm.jar weka.gui.GUIChooser or this on Win32 (if you're starting it from the commandline): java -classpath \"%CLASSPATH%;weka.jar;libsvm.jar\" weka.gui.GUIChooser If you're starting WEKA from the Start Menu on Windows, you'll have to add the libsvm.jar to your CLASSPATH environment variable. The following steps are for Windows XP (unfortunately, the GUI changes among the different Windows versions): right-click on My Computer and select Properties from the menu choose the Advanced tab and click on Environment variables at the bottom either add or modify a variable called CLASSPATH and add the libsvm.jar with full path to it Troubleshooting # LibSVM classes not in CLASSPATH! Check whether the libsvm.jar is really in your CLASSPATH. Execute the following command in the SimpleCLI : java weka.core.SystemInfo The property java.class.path must list the libsvm.jar . If it is listed, check whether the path is correct. If you're on Windows and you find %CLASSPATH% there, see the next bullet point to fix this. On Windows, if you added the libsvm.jar to your CLASSPATH environment variable, it can still happen that WEKA pops up the error message that the LibSVM classes are not in your CLASSPATH.
This can happen when the %CLASSPATH% does not get expanded to its actual value when starting up WEKA. You can inspect the CLASSPATH with which WEKA got started using the SimpleCLI (see previous bullet point). If %CLASSPATH% is listed there, your system has the same problem. You can also explicitly add a .jar file to RunWeka.ini . Note: backslashes have to be escaped, not only once, but twice (they get interpreted by Java twice!). In other words, instead of one you have to use four : C:\\some\\where then turns into C:\\\\\\\\some\\\\\\\\where . Issues with libsvm.jar that were discussed on the Weka list in April 2007 (and may no longer be relevant) # The following changes were not incorporated in WEKA, since they would also mean modifying the LibSVM Java code, which (I think) is autogenerated from the C code. The authors of LibSVM might have to consider that update. It's left to the reader to incorporate these changes. libsvm.svm uses Math.random # libsvm.svm calls Math.random, so the model it returns is usually different for the same training set and svm parameters over time. Obviously, if you call libsvm.svm from weka.classifiers.functions.libsvm, and you call it again from libsvm.svm_train, the results are also different. You can use libsvm.svm_save_model to record the svms into files, and then compare the model file from WEKA LibSVM with the model file from libsvm.svm_predict. Then you can see that the ProbA values are usually different. The WEKA Experimenter is based on always using the same random sequences in order to repeat experiments with the same results. So, I'm afraid some important design changes are required on libsvm.jar and weka.classifiers.functions.libsvm.class to keep such behaviour. We made a quick fix, adding a static Random attribute to the libsvm.svm class: static java.util.Random ranGen = new Random(0); We have changed all Math.random() invocations to ranGen.nextDouble(). Then we obtained the same svm from the WEKA LibSVM wrapper as from LibSVM's svm_train. However, WEKA accuracy results on primary_tumor data were still worse, so there's something wrong when weka uses the svm model at the testing step. Classes without instances # The ARFF format provides some meta-information (i.e., attribute names and types, and the set of possible values for nominal attributes), but the LibSVM format doesn't. So if there are classes in the dataset with zero occurrences through all the instances, LibSVM thinks that these classes don't exist, whereas WEKA knows they exist. For example, there is a class in the primary tumor dataset that never appears. When the WEKA Experimenter performs testing, it calls: public static double svm_predict_probability(svm_model model, svm_node[] x, double[] prob_estimates) passing the array prob_estimates full of zeros (array cells are initialized to zero). The size of the array is equal to the number of classes (= 22). On the other hand, if this method is invoked from libsvm.svm_predict, the class that never appears is ignored, so the array dimension is now equal to 21. So accuracy results are different depending on the origin of the svm_predict_probability method invocation. I think that better results are obtained if classes without instances are ignored, but I don't know if it is very fair. In fact, accuracies from weka.libsvm and from libsvm.svm_predict seem to be the same if the class that never appears is removed from the ARFF file.
Note that this problem only appears when testing, because the training code always uses the svm_group_classes method to compute the number of classes, so the Instances.numClasses() value is never used for training. Moreover, maybe the mismatch between the training number of classes and the testing number of classes is the reason behind worse accuracy results when the svm_predict_probability invocation is made from WEKA, but I haven't proved it yet. Note that this problem also happens when you have a class with fewer examples than the number of folds. For some folds, the class will not have training examples. We also made a quick fix for this problem: Add this public method to the libsvm.svm_model class: public int getNr_class() { return nr_class; } Make the following changes to the distributionForInstance method in weka.classifiers.functions.LibSVM. First line of the method: int[] labels = new int[instance.numClasses()]; could be changed to int[] labels = new int[((svm_model) m_Model).getNr_class()]; Last line in the \"if(m_ProbabilityEstimates)\" block: prob_estimates = new double[instance.numClasses()]; could be changed to prob_estimates = new double[((svm_model) m_Model).getNr_class()];","title":"Lib svm"},{"location":"lib_svm/#description","text":"Wrapper class for the LibSVM library by Chih-Chung Chang and Chih-Jen Lin. The original wrapper, named WLSVM, was developed by Yasser EL-Manzalawy. The current version is a complete rewrite of the wrapper, using Reflection in order to avoid compilation errors in case the libsvm.jar is not in the CLASSPATH . Important note: From WEKA >= 3.7.2, installation and use of LibSVM in WEKA has been simplified by the creation of a LibSVM package that can be installed using either the graphical or command line package manager .","title":"Description"},{"location":"lib_svm/#reference-weka-368","text":"LibSVM WLSVM","title":"Reference (Weka <= 3.6.8)"},{"location":"lib_svm/#package","text":"weka.classifiers.functions","title":"Package"},{"location":"lib_svm/#download","text":"The wrapper class has been part of WEKA since version 3.5.2. But LibSVM , as a third-party tool, needs to be downloaded separately. It is recommended to upgrade to a post-3.5.3 version (or git ) for bug-fixes and extensions (it now contains the distributionForInstance method).","title":"Download"},{"location":"lib_svm/#classpath","text":"Add the libsvm.jar from the LibSVM distribution to your CLASSPATH to make it available. Note: Do NOT start WEKA then with java -jar weka.jar . The -jar option overwrites the CLASSPATH ; it does not augment it (a very common trap to fall into). Instead use something like this on Linux: java -classpath $CLASSPATH :weka.jar:libsvm.jar weka.gui.GUIChooser or this on Win32 (if you're starting it from the commandline): java -classpath \"%CLASSPATH%;weka.jar;libsvm.jar\" weka.gui.GUIChooser If you're starting WEKA from the Start Menu on Windows, you'll have to add the libsvm.jar to your CLASSPATH environment variable. The following steps are for Windows XP (unfortunately, the GUI changes among the different Windows versions): right-click on My Computer and select Properties from the menu choose the Advanced tab and click on Environment variables at the bottom either add or modify a variable called CLASSPATH and add the libsvm.jar with full path to it","title":"CLASSPATH"},{"location":"lib_svm/#troubleshooting","text":"LibSVM classes not in CLASSPATH! Check whether the libsvm.jar is really in your CLASSPATH.
Execute the following command in the SimpleCLI : java weka.core.SystemInfo The property java.class.path must list the libsvm.jar . If it is listed, check whether the path is correct. If you're on Windows and you find %CLASSPATH% there, see the next bullet point to fix this. On Windows, if you added the libsvm.jar to your CLASSPATH environment variable, it can still happen that WEKA pops up the error message that the LibSVM classes are not in your CLASSPATH. This can happen when the %CLASSPATH% does not get expanded to its actual value when starting up WEKA. You can inspect the CLASSPATH with which WEKA got started using the SimpleCLI (see previous bullet point). If %CLASSPATH% is listed there, your system has the same problem. You can also explicitly add a .jar file to RunWeka.ini . Note: backslashes have to be escaped, not only once, but twice (they get interpreted by Java twice!). In other words, instead of one you have to use four : C:\\some\\where then turns into C:\\\\\\\\some\\\\\\\\where .","title":"Troubleshooting"},{"location":"lib_svm/#issues-with-libsvmjar-that-were-discussed-on-the-weka-list-in-april-2007-and-may-no-longer-be-relevant","text":"The following changes were not incorporated in WEKA, since they would also mean modifying the LibSVM Java code, which (I think) is autogenerated from the C code. The authors of LibSVM might have to consider that update. It's left to the reader to incorporate these changes.","title":"Issues with libsvm.jar that were discussed on the Weka list in April 2007 (and may no longer be relevant)"},{"location":"lib_svm/#libsvmsvm-uses-mathrandom","text":"libsvm.svm calls Math.random, so the model it returns is usually different for the same training set and svm parameters over time. Obviously, if you call libsvm.svm from weka.classifiers.functions.libsvm, and you call it again from libsvm.svm_train, the results are also different. You can use libsvm.svm_save_model to record the svms into files, and then compare the model file from WEKA LibSVM with the model file from libsvm.svm_predict. Then you can see that the ProbA values are usually different. The WEKA Experimenter is based on always using the same random sequences in order to repeat experiments with the same results. So, I'm afraid some important design changes are required on libsvm.jar and weka.classifiers.functions.libsvm.class to keep such behaviour. We made a quick fix, adding a static Random attribute to the libsvm.svm class: static java.util.Random ranGen = new Random(0); We have changed all Math.random() invocations to ranGen.nextDouble(). Then we obtained the same svm from the WEKA LibSVM wrapper as from LibSVM's svm_train. However, WEKA accuracy results on primary_tumor data were still worse, so there's something wrong when weka uses the svm model at the testing step.","title":"libsvm.svm uses Math.random"},{"location":"lib_svm/#classes-without-instances","text":"The ARFF format provides some meta-information (i.e., attribute names and types, and the set of possible values for nominal attributes), but the LibSVM format doesn't. So if there are classes in the dataset with zero occurrences through all the instances, LibSVM thinks that these classes don't exist, whereas WEKA knows they exist. For example, there is a class in the primary tumor dataset that never appears. When the WEKA Experimenter performs testing, it calls: public static double svm_predict_probability(svm_model model, svm_node[] x, double[] prob_estimates) passing the array prob_estimates full of zeros (array cells are initialized to zero).
The size of the array is equal to the number of classes (= 22). On the other hand, if this method is invoked from libsvm.svm_predict, the class that never appears is ignored, so the array dimension is now equal to 21. So accuracy results are different depending on the origin of the svm_predict_probability method invocation. I think that better results are obtained if classes without instances are ignored, but I don't know if it is very fair. In fact, accuracies from weka.libsvm and from libsvm.svm_predict seem to be the same if the class that never appears is removed from the ARFF file. Note that this problem only appears when testing, because the training code always uses the svm_group_classes method to compute the number of classes, so the Instances.numClasses() value is never used for training. Moreover, maybe the mismatch between the training number of classes and the testing number of classes is the reason behind worse accuracy results when the svm_predict_probability invocation is made from WEKA, but I haven't proved it yet. Note that this problem also happens when you have a class with fewer examples than the number of folds. For some folds, the class will not have training examples. We also made a quick fix for this problem: Add this public method to the libsvm.svm_model class: public int getNr_class() { return nr_class; } Make the following changes to the distributionForInstance method in weka.classifiers.functions.LibSVM. First line of the method: int[] labels = new int[instance.numClasses()]; could be changed to int[] labels = new int[((svm_model) m_Model).getNr_class()]; Last line in the \"if(m_ProbabilityEstimates)\" block: prob_estimates = new double[instance.numClasses()]; could be changed to prob_estimates = new double[((svm_model) m_Model).getNr_class()];","title":"Classes without instances"},{"location":"literature/","text":"Apart from Data Mining: Practical Machine Learning Tools and Techniques , there are several other books with material on Weka: Jason Bell (2020) Machine Learning: Hands-On for Developers and Technical Professionals, Second Edition , Wiley. Richard J. Roiger (2020) Just Enough R! An Interactive Approach to Machine Learning and Analytics , CRC Press. Parteek Bhatia (2019) Data Mining and Data Warehousing Principles and Practical Techniques , Cambridge University Press. Mark Wickham (2018) Practical Java Machine Learning Projects with Google Cloud Platform and Amazon Web Services , APress. AshishSingh Bhatia, Bostjan Kaluza (2018) Machine Learning in Java - Second Edition , Packt Publishing. Richard J. Roiger (2016) Data Mining: A Tutorial-Based Primer , CRC Press. Mei Yu Yuan (2016) Data Mining and Machine Learning: WEKA Technology and Practice , Tsinghua University Press (in Chinese). J\u00fcrgen Cleve, Uwe L\u00e4mmel (2016) Data Mining , De Gruyter (in German). Eric Rochester (2015) Clojure Data Analysis Cookbook - Second Edition , Packt Publishing. Bo\u0161tjan Kalu\u017ea (2013) Instant Weka How-to , Packt Publishing. Hongbo Du (2010) Data Mining Techniques and Applications , Cengage Learning. A book explaining why Weka won't learn (discovered by Stuart Inglis).","title":"Literature"},{"location":"mailing_list/","text":"The WEKA Mailing list can be found here: List for subscribing/unsubscribing to the list. Archives for searching previously posted messages. Before posting, please read the mailing list etiquette .
Once you have subscribed to the list (a moderator may have to approve your request), you can send posts to the list using the following email address: weka-users@lists.sourceforge.net NB: The mailing list moved to Sourceforge.net in mid-December 2024, due to the university mailman server being decommissioned. You can find the old archives on this mirror .","title":"Mailing list"},{"location":"making_predictions/","text":"Command line # The following sections show how to obtain predictions/classifications without writing your own Java code via the command line. Classifiers # After a model has been saved , one can make predictions for a test set, whether that set contains valid class values or not. The output will contain both the actual and predicted class. (Note that if the test set contains simply '?' for the class label for each instance, the \"actual\" class label for each instance will not contain useful information, but the predicted class label will.) The -T command-line switch specifies the dataset of instances whose classes are to be predicted, while the -p switch allows the user to write out a range of attributes (examples: \"1-2\" for the first and second attributes, or \"0\" for no attributes). Sample command line: java weka.classifiers.trees.J48 -T unclassified.arff -l j48.model -p 0 The format of the output is as follows: inst# actual predicted error prediction Here, the actual and predicted class values are output as index:value pairs, and the error column contains a \"+\" only for those items that were mispredicted. Note that if the actual class label is always \"?\" (i.e., the dataset does not include known class labels), the error column will always be empty. Sample output: inst# actual predicted error prediction 1 1:? 1:0 0.757 2 1:? 1:0 0.824 3 1:? 1:0 0.807 4 1:? 1:0 0.807 5 1:? 1:0 0.79 6 1:? 2:1 0.661 ... In this case, taken directly from a test dataset where all class attributes were marked by \"?\", the \"actual\" column, which can be ignored, simply states that each instance belongs to an unknown class. The \"predicted\" column shows that instances 1 through 5 are predicted to be of class 1, whose value is 0, and instance 6 is predicted to be of class 2, whose value is 1. The error field is empty; if predictions were being performed on a labeled test set, each instance where the prediction failed to match the label would contain a \"+\". The probability that instance 1 actually belongs to class 0 is estimated at 0.757. Notes: Since Weka 3.5.4 you can also output the complete class distribution, not just the prediction, by using the parameter -distribution in conjunction with the -p option. In this case, \"*\" is placed beside the probability in the distribution that corresponds to the predicted class value. If you have an ID attribute in your dataset as the first attribute (you can always add one with the AddID filter), you could output it with -p 1 instead of using -p 0 . This works only for explicit train/test sets, but you can use the Explorer for cross-validation. Using the -classifications option instead of -p ... you can also use different output formats, like CSV : -classifications \"weka.classifiers.evaluation.output.prediction.CSV -p ...\" (the -p option takes the indices of the additional attributes to output). Filters # The AddClassification filter (package weka.filters.supervised.attribute ) can either train a classifier on the input data and transform it, or load a serialized model to transform the input data (even though the filter was introduced in 3.5.4, due to a bug in the commandline option handling, it is recommended to download a version >3.5.5 from the Weka homepage).
This filter can add the classification, class distribution and the error per row as extra attributes to the dataset: training the classifier, e.g., J48, on the input data and replacing the class values with the ones of the trained classifier: java \\ weka.filters.supervised.attribute.AddClassification \\ -W \"weka.classifiers.trees.J48\" \\ -classification \\ -remove-old-class \\ -i train.arff \\ -o train_classified.arff \\ -c last using a serialized model, e.g., a J48 model, to replace the class values with the ones predicted by the serialized model: java \\ weka.filters.supervised.attribute.AddClassification \\ -serialized /some/where/j48.model \\ -classification \\ -remove-old-class \\ -i train.arff \\ -o train_classified.arff \\ -c last GUI # The Weka GUI also allows you to output predictions based on a previously saved model. Explorer # See the Explorer section of the Saving and loading models article to set up the Explorer. Additionally, you need to check the Output predictions option in the More options dialog. Right-clicking on the respective results history item and selecting Re-evaluate model on current test set will then output the predictions as well (the statistics will be useless due to missing class values in the test set, so just ignore them). The output is similar to the one produced by the commandline. Example output for the anneal UCI dataset: == Predictions on test set == inst#, actual, predicted, error, probability distribution 1 ? 3:3 + 0 0 *1 0 0 0 2 ? 3:3 + 0 0 *1 0 0 0 3 ? 3:3 + 0 0 *1 0 0 0 ... 17 ? 6:U + 0 0 0 0 0 *1 18 ? 6:U + 0 0 0 0 0 *1 19 ? 3:3 + 0 0 *1 0 0 0 20 ? 3:3 + 0 0 *1 0 0 0 ... Note: The developer version (>3.5.6) can also output additional attributes, like the commandline does with the -p option. In the More options... dialog you can specify those attribute indices with Output additional attributes , e.g., first or 1-7 . In contrast to the commandline, this output also works for cross-validation. KnowledgeFlow # Using the PredictionAppender # With the PredictionAppender (from the Evaluation toolbar) you cannot use an already saved model, but you can train a classifier on a dataset and output an ARFF file with the predictions appended as an additional attribute. Here's an example setup: /---dataSet--> TrainingSetMaker ---trainingSet--\\ ArffLoader --< >--> J48... \\---dataSet--> TestSetMaker -------testSet------/ ...J48 --batchClassifier--> PredictionAppender --testSet--> ArffSaver Using the AddClassification filter # The AddClassification filter can be used in the KnowledgeFlow as well, either for training a model, or for using a serialized model to perform the predictions. An example setup could look like this: ArffLoader --dataSet--> ClassAssigner --dataSet--> AddClassification --dataSet--> ArffSaver Java # If you want to perform the classification within your own code, see the classifying instances section of this article , which explains the Weka API in general.
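As a quick illustration of that route, here is a minimal sketch (the file names j48.model and unclassified.arff are placeholders; the model is assumed to have been serialized beforehand as described in the Saving and loading models article):
import weka.classifiers.Classifier;
import weka.core.Instances;
import weka.core.SerializationHelper;
import weka.core.converters.ConverterUtils.DataSource;
public class MakePredictions {
  public static void main(String[] args) throws Exception {
    // deserialize the previously saved model
    Classifier cls = (Classifier) SerializationHelper.read(\"j48.model\");
    // load the unlabeled data and declare the last attribute to be the class
    Instances unlabeled = DataSource.read(\"unclassified.arff\");
    unlabeled.setClassIndex(unlabeled.numAttributes() - 1);
    // output instance number and predicted class label
    for (int i = 0; i < unlabeled.numInstances(); i++) {
      double pred = cls.classifyInstance(unlabeled.instance(i));
      System.out.println((i + 1) + \": \" + unlabeled.classAttribute().value((int) pred));
    }
  }
}
If class probabilities are needed instead of crisp predictions, distributionForInstance can be called in the same loop.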
See also # Saving and loading models Use Weka in your Java code - general information about using the Weka API Using ID attributes Version # The developer version shortly before the release of 3.5.6 was used as the basis for this article.","title":"Command line"},{"location":"making_predictions/#command-line","text":"The following sections show how to obtain predictions/classifications without writing your own Java code via the command line.","title":"Command line"},{"location":"making_predictions/#classifiers","text":"After a model has been saved , one can make predictions for a test set, whether that set contains valid class values or not. The output will contain both the actual and predicted class. (Note that if the test set contains simply '?' for the class label for each instance, the \"actual\" class label for each instance will not contain useful information, but the predicted class label will.) The -T command-line switch specifies the dataset of instances whose classes are to be predicted, while the -p switch allows the user to write out a range of attributes (examples: \"1-2\" for the first and second attributes, or \"0\" for no attributes). Sample command line: java weka.classifiers.trees.J48 -T unclassified.arff -l j48.model -p 0 The output consists of the following columns: the instance number, the actual class, the predicted class, an error flag and the prediction confidence; a \"+\" in the error column occurs only for those items that were mispredicted. Note that if the actual class label is always \"?\" (i.e., the dataset does not include known class labels), the error column will always be empty. Sample output: inst# actual predicted error prediction 1 1:? 1:0 0.757 2 1:? 1:0 0.824 3 1:? 1:0 0.807 4 1:? 1:0 0.807 5 1:? 1:0 0.79 6 1:? 2:1 0.661 ... In this case, taken directly from a test dataset where all class attributes were marked by \"?\", the \"actual\" column, which can be ignored, simply states that each class belongs to an unknown class. The \"predicted\" column shows that instances 1 through 5 are predicted to be of class 1, whose value is 0, and instance 6 is predicted to be of class 2, whose value is 1. The error field is empty; if predictions were being performed on a labeled test set, each instance where the prediction failed to match the label would contain a \"+\". The probability that instance 1 actually belongs to class 0 is estimated at 0.757. Notes: Since Weka 3.5.4 you can also output the complete class distribution, not just the prediction, by using the parameter -distribution in conjunction with the -p option. In this case, \"*\" is placed beside the probability in the distribution that corresponds to the predicted class value. If you have an ID attribute in your dataset as first attribute (you can always add one with the AddID filter), you could output it with -p 1 instead of using -p 0 . This works only for explicit train/test sets, but you can use the Explorer for cross-validation. Using the -classifications option instead of -p ... you can also use different output formats, like CSV : -classifications \"weka.classifiers.evaluation.output.prediction.CSV -p ...\" (the -p option takes the indices of the additional attributes to output).","title":"Classifiers"},{"location":"making_predictions/#filters","text":"The AddClassification filter (package weka.filters.supervised.attribute ) can either train a classifier on the input data and transform this or load a serialized model to transform the input data (even though the filter was introduced in 3.5.4, due to a bug in the commandline option handling, it is recommended to download a version >3.5.5 from the Weka homepage).
This filter can add the classification, class distribution and the error per row as extra attributes to the dataset. training the classifier, e.g., J48, on the input data and replacing the class values with the ones of the trained classifier: java \\ weka.filters.supervised.attribute.AddClassification \\ -W \"weka.classifiers.trees.J48\" \\ -classification \\ -remove-old-class \\ -i train.arff \\ -o train_classified.arff \\ -c last * using a serialized model, e.g., a J48 model, to replace the class values with the ones predicted by the serialized model: java \\ weka.filters.supervised.attribute.AddClassification \\ -serialized /some/where/j48.model \\ -classification \\ -remove-old-class \\ -i train.arff \\ -o train_classified.arff \\ -c last","title":"Filters"},{"location":"making_predictions/#gui","text":"The Weka GUI allows you as well to output predictions based on a previously saved model.","title":"GUI"},{"location":"making_predictions/#explorer","text":"See the Explorer section of the Saving and loading models article to setup the Explorer. Additionally, you need to check the Output predictions options in the More options dialog. Right-clicking on the respective results history item and selecting Re-evaluate model on current test set will output then the predictions as well (the statistics will be useless due to missing class values in the test set, so just ignore them). The output is similar to the one produced by the commandline. Example output for the anneal UCI dataset: == Predictions on test set == inst#, actual, predicted, error, probability distribution 1 ? 3:3 + 0 0 *1 0 0 0 2 ? 3:3 + 0 0 *1 0 0 0 3 ? 3:3 + 0 0 *1 0 0 0 ... 17 ? 6:U + 0 0 0 0 0 *1 18 ? 6:U + 0 0 0 0 0 *1 19 ? 3:3 + 0 0 *1 0 0 0 20 ? 3:3 + 0 0 *1 0 0 0 ... Note: The developer version (>3.5.6) can also output additional attributes like the commandline with the -p option. In the More options... dialog you can specify those attribute indices with Output additional attributes , e.g., first or 1-7 . In contrast to the commandline, this output also works for cross-validation.","title":"Explorer"},{"location":"making_predictions/#knowledgeflow","text":"","title":"KnowledgeFlow"},{"location":"making_predictions/#using-the-predictionappender","text":"With the PredictionAppender (from the Evaluation toolbar) you cannot use an already saved model, but you can train a classifier on a dataset and output an ARFF file with the predictions appended as additional attribute. Here's an example setup: /---dataSet--> TrainingSetMaker ---trainingSet--\\ ArffLoader --< >--> J48... \\---dataSet--> TestSetMaker -------testSet------/ ...J48 --batchClassifier--> PredictionAppender --testSet--> ArffSaver","title":"Using the PredictionAppender"},{"location":"making_predictions/#using-the-addclassification-filter","text":"The AddClassification filter can be used in the KnowledgeFlow as well, either for training a model, or for using a serialized model to perform the predictions. 
An example setup could look like this: ArffLoader --dataSet--> ClassAssigner --dataSet--> AddClassification --dataSet--> ArffSaver","title":"Using the AddClassification filter"},{"location":"making_predictions/#java","text":"If you want to perform the classification within your own code, see the classifying instances section of this article , explaining the Weka API in general.","title":"Java"},{"location":"making_predictions/#see-also","text":"Saving and loading models Use Weka in your Java code - general information about using the Weka API Using ID attributes","title":"See also"},{"location":"making_predictions/#version","text":"The developer version shortly before the release of 3.5.6 was used as the basis for this article.","title":"Version"},{"location":"mathematical_functions/","text":"Mathematical functions implemented on dataset instances, like tan, cos, exp, log, and so on, can be achieved using one of the following filters: AddExpression (Stable version) MathExpression (Stable version)","title":"Mathematical functions"},{"location":"maven/","text":"Maven is another build tool. But unlike Ant , it is a more high-level tool. Though its configuration file, pom.xml , is written in XML as well, Maven uses a different approach to the build process. In Ant, you tell it where to find Java classes for compilation, what libraries to compile against, where to put the compiled ones and then how to combine them into a jar. With Maven, you only specify dependent libraries, a compile and a jar plugin and maybe tweak the options a bit. For this to work, Maven enforces a strict directory structure (though you can tweak that if you need to). So why another build tool? # Ant scripts quite often create a fat jar , i.e., a jar that contains not only the project's code, but also the content of the libraries the code was compiled against. This is handy if you only want to have a single jar. However, it is a nightmare if you need to update a single library, but all you have is a single, enormous jar. Maven handles dependencies automatically , relying on libraries (they call them artifacts) to be publicly available, e.g., on Maven Central . It allows you to use newer versions of libraries than defined by the dependent libraries (e.g., critical bug fixes), without having to modify any jars manually. Though Maven can also generate fat jar files, it is not considered good practice, as it defeats Maven's automatic version resolution. In order to make Weka, and most of its packages, available to a wider audience (e.g., other software developers), we also publish on Maven Central. Compiling # For compiling Weka, you would issue a command like this (in the same directory as pom.xml ): mvn clean install If you don't want the tests to run, use this: mvn clean install -DskipTests=true","title":"Maven"},{"location":"maven/#so-why-another-build-tool","text":"Ant scripts quite often create a fat jar , i.e., a jar that contains not only the project's code, but also the content of the libraries the code was compiled against. This is handy if you only want to have a single jar. However, it is a nightmare if you need to update a single library, but all you have is a single, enormous jar. Maven handles dependencies automatically , relying on libraries (they call them artifacts) to be publicly available, e.g., on Maven Central . It allows you to use newer versions of libraries than defined by the dependent libraries (e.g., critical bug fixes), without having to modify any jars manually.
Though Maven can also generate fat jar files, it is not considered good practice, as it defeats Maven's automatic version resolution. In order to make Weka, and most of its packages, available to a wider audience (e.g., other software developers), we also publish on Maven Central.","title":"So why another build tool?"},{"location":"maven/#compiling","text":"For compiling Weka, you would issue a command like this (in the same directory as pom.xml ): mvn clean install If you don't want the tests to run, use this: mvn clean install -DskipTests=true","title":"Compiling"},{"location":"memory_consumption_and_garbage_collector/","text":"There is the ability to print how much memory is available in the Explorer and Experimenter and to run the garbage collector. Just right-click over the Status area in the Explorer/Experimenter.","title":"Memory consumption and garbage collector"},{"location":"message_classifier/","text":"In the following you'll find some information about the MessageClassifier from the 2nd edition of the Data Mining book by Witten and Frank. Source code # Depending on the version of the book, download the corresponding version (this article is based on the 2nd edition): 1st Edition: MessageClassifier 2nd Edition: MessageClassifier ( book , stable-3.8 , developer ) Compiling # * compile the source code like this if the weka.jar is already in your CLASSPATH environment variable: javac MessageClassifier.java * otherwise, use this command line (of course, replace /path/to/ with the correct path on your system): javac -classpath /path/to/weka.jar MessageClassifier.java Note: The classpath handling is omitted from here on. Training # If you run the MessageClassifier for the first time, you need to provide labeled examples to build a classifier from, i.e., messages (\" -m \") and the corresponding classes (\" -c \"). Since the data and the model are kept for future use, one has to specify a filename where the MessageClassifier is serialized to (\" -t \"). Here's an example that labels the message email1.txt as miss : java MessageClassifier -m email1.txt -c miss -t messageclassifier.model Repeat this for all the messages you want to have classified. Classifying # Classifying an unseen message is quite straightforward: one just omits the class option (\" -c \"). The following call java MessageClassifier -m email1023.txt -t messageclassifier.model will produce something like this: Message classified as : miss","title":"Message classifier"},{"location":"message_classifier/#source-code","text":"Depending on the version of the book, download the corresponding version (this article is based on the 2nd edition): 1st Edition: MessageClassifier 2nd Edition: MessageClassifier ( book , stable-3.8 , developer )","title":"Source code"},{"location":"message_classifier/#compiling","text":"* compile the source code like this if the weka.jar is already in your CLASSPATH environment variable: javac MessageClassifier.java * otherwise, use this command line (of course, replace /path/to/ with the correct path on your system): javac -classpath /path/to/weka.jar MessageClassifier.java Note: The classpath handling is omitted from here on.","title":"Compiling"},{"location":"message_classifier/#training","text":"If you run the MessageClassifier for the first time, you need to provide labeled examples to build a classifier from, i.e., messages (\" -m \") and the corresponding classes (\" -c \").
Since the data and the model are kept for future use, one has to specify a filename where the MessageClassifier is serialized to (\" -t \"). Here's an example that labels the message email1.txt as miss : java MessageClassifier -m email1.txt -c miss -t messageclassifier.model Repeat this for all the messages you want to have classified.","title":"Training"},{"location":"message_classifier/#classifying","text":"Classifying an unseen message is quite straightforward: one just omits the class option (\" -c \"). The following call java MessageClassifier -m email1023.txt -t messageclassifier.model will produce something like this: Message classified as : miss","title":"Classifying"},{"location":"metacost/","text":"This metaclassifier makes its base classifier cost-sensitive using the method specified in: Pedro Domingos: MetaCost: A general method for making classifiers cost-sensitive. In: Fifth International Conference on Knowledge Discovery and Data Mining, 155-164, 1999. This classifier should produce similar results to one created by passing the base learner to Bagging, which is in turn passed to a CostSensitiveClassifier operating on minimum expected cost. The difference is that MetaCost produces a single cost-sensitive classifier of the base learner, giving the benefits of fast classification and interpretable output (if the base learner itself is interpretable). This implementation uses all bagging iterations when reclassifying training data (the MetaCost paper reports a marginal improvement when only those iterations containing each training instance are used in reclassifying that instance). Examples # The following cost matrix is used for a 3-class problem: -3 1 1 1 -6 1 0 0 0 MetaCost will compute the costs ( Costs ) based on the class distribution the bagged base learner returns ( Class probs ) and select the class with the lowest cost ( Chosen class ): +---------------+-----------------+--------------+ | Class probs | Costs | Chosen class | +---------------+-----------------+--------------+ | 1.0, 0.0, 0.0 | -3.0, 1.0, 1.0 | 1 | | 0.0, 1.0, 0.0 | 1.0, -6.0, 1.0 | 2 | | 0.0, 0.0, 1.0 | 0.0, 0.0, 0.0 | 1 * | | 0.7, 0.1, 0.2 | -2.0, 0.1, 0.8 | 1 | | 0.2, 0.7, 0.1 | 0.1, -4.0, 0.9 | 2 | | 0.1, 0.2, 0.7 | -0.1, -1.1, 0.3 | 2 | +---------------+-----------------+--------------+ * in case of a tie, the first one will be picked. See also # CostSensitiveClassifier CostMatrix Links # Publication on CiteSeer","title":"Metacost"},{"location":"metacost/#examples","text":"The following cost matrix is used for a 3-class problem: -3 1 1 1 -6 1 0 0 0 MetaCost will compute the costs ( Costs ) based on the class distribution the bagged base learner returns ( Class probs ) and select the class with the lowest cost ( Chosen class ): +---------------+-----------------+--------------+ | Class probs | Costs | Chosen class | +---------------+-----------------+--------------+ | 1.0, 0.0, 0.0 | -3.0, 1.0, 1.0 | 1 | | 0.0, 1.0, 0.0 | 1.0, -6.0, 1.0 | 2 | | 0.0, 0.0, 1.0 | 0.0, 0.0, 0.0 | 1 * | | 0.7, 0.1, 0.2 | -2.0, 0.1, 0.8 | 1 | | 0.2, 0.7, 0.1 | 0.1, -4.0,
0.9 | 2 | | 0.1, 0.2, 0.7 | -0.1, -1.1, 0.3 | 2 | +---------------+-----------------+--------------+ * in case of a tie, the first one will be picked.","title":"Examples"},{"location":"metacost/#see-also","text":"CostSensitiveClassifier CostMatrix","title":"See also"},{"location":"metacost/#links","text":"Publication on CiteSeer","title":"Links"},{"location":"ms_sql_server_2000_desktop_engine/","text":"Installation # Download the Desktop Engine (see Links ) Extract the files by running the downloaded executable Edit the setup.ini file and add a strong password for the sa account: SAPWD=*password* Note: the default password is empty, which can prevent the setup from continuing the installation Run the setup Testing # This article lists Java code for testing the connection Troubleshooting # Error Establishing Socket with JDBC Driver Add TCP/IP to the list of protocols as stated in this article Login failed for user 'sa'. Reason: Not associated with a trusted SQL Server connection. For changing the authentication to mixed mode see this article Links # Microsoft SQL Server 2000 (Desktop Engine) Microsoft SQL Server 2000 JDBC Driver SP 3","title":"Installation"},{"location":"ms_sql_server_2000_desktop_engine/#installation","text":"Download the Desktop Engine (see Links ) Extract the files by running the downloaded executable Edit the setup.ini file and add a strong password for the sa account: SAPWD=*password* Note: the default password is empty, which can prevent the setup from continuing the installation Run the setup","title":"Installation"},{"location":"ms_sql_server_2000_desktop_engine/#testing","text":"This article lists Java code for testing the connection","title":"Testing"},{"location":"ms_sql_server_2000_desktop_engine/#troubleshooting","text":"Error Establishing Socket with JDBC Driver Add TCP/IP to the list of protocols as stated in this article Login failed for user 'sa'. Reason: Not associated with a trusted SQL Server connection. For changing the authentication to mixed mode see this article","title":"Troubleshooting"},{"location":"ms_sql_server_2000_desktop_engine/#links","text":"Microsoft SQL Server 2000 (Desktop Engine) Microsoft SQL Server 2000 JDBC Driver SP 3","title":"Links"},{"location":"mtj_with_nvblas/","text":"(The following is based on a post from Eibe Frank on the Weka mailing list.) 
Here is an example of running MTJ with NVBLAS (NVIDIA's BLAS wrapper) on Ubuntu: Installed https://prdownloads.sourceforge.net/weka/weka-3-8-6-azul-zulu-linux.zip in /home/eibe/Desktop Ran ~/Desktop/weka-3-8-6/weka.sh -main weka.core.WekaPackageManager -install-package netlibNativeLinux To install CPU-based system BLAS/LAPACK, ran sudo apt-get install libblas-dev liblapack-dev sudo ln -s /usr/lib/x86_64-linux-gnu/libblas.so.3 /usr/lib/libblas.so.3 sudo ln -s /usr/lib/x86_64-linux-gnu/liblapack.so.3 /usr/lib/liblapack.so.3 Downloaded and installed CUDA 11.6 from https://developer.nvidia.com/cuda-downloads Copied example nvblas.conf from https://docs.nvidia.com/cuda/nvblas/ into local directory using cat > nvblas.conf Edited nvblas.conf to have NVBLAS_CPU_BLAS_LIB /usr/lib/x86_64-linux-gnu/blas/libblas.so.3 Now, by adapting what's given at https://github.com/fommil/netlib-java/wiki/NVBLAS , issued export LD_LIBRARY_PATH=/usr/local/cuda-11.6/lib64:/usr/lib/x86_64-linux-gnu/blas/libblas.so.3 Then, ~/Desktop/weka-3-8-6/weka.sh -main weka.Run .RandomRBF -a 5000 > RandomRBF.a5000.arff LD_PRELOAD=libnvblas.so ~/Desktop/weka-3-8-6/weka.sh -main weka.Run .attributeSelection.PrincipalComponents -i RandomRBF.a5000.arff Observation: Memory is being allocated on the GPU. Looking at nvblas.log , the GPU is used, but only for some dgemm operations. However, according to https://docs.nvidia.com/cuda/nvblas/ , the trmm operation (which is executed on the CPU) should also be supported by the GPU.","title":"Mtj with nvblas"},{"location":"multi_instance_classification/","text":"Multi-instance (MI) classification is a supervised learning technique, but differs from normal supervised learning: it has multiple instances in an example, and only one class label is observable for all the instances in an example Classifiers # Multi-instance classifiers were originally available through a separate software package, Multi-Instance Learning Kit (= MILK). Weka now handles relational attributes natively (since 3.5.3) and the multi-instance classifiers are available through the multiInstanceLearning package and the filters through the multiInstanceFilters package. Once the packages have been installed, the classifiers can be found in the following package: weka.classifiers.mi Data format # The data format for multi-instance classifiers is fairly simple: bag-id - nominal attribute; unique identifier for each bag bag - relational attribute; contains the instances of an example class - the class label for the examples Weka offers two filters to convert from flat file format (or propositional format), which is normally used in supervised classification, to multi-instance format and vice versa: weka.filters.unsupervised.attribute.PropositionalToMultiInstance weka.filters.unsupervised.attribute.MultiInstanceToPropositional Here is an example of the musk1 UCI dataset, used quite often in publications covering MI learning (Note: ... denotes omission): propositional format: This ARFF file lists all the attributes, molecule_name (which is the bag-id), f1 to f166 (containing the actual data of the instances) and the class attribute. @relation musk1 @attribute molecule_name {MUSK-jf78,MUSK-jf67,MUSK-jf59,...,NON-MUSK-199} @attribute f1 numeric @attribute f2 numeric @attribute f3 numeric @attribute f4 numeric @attribute f5 numeric ... @attribute f166 numeric @attribute class {0,1} @data MUSK-188,42,-198,-109,-75,-117,11,23,-88,-28,-27,...,48,-37,6,30,1 MUSK-188,42,-191,-142,-65,-117,55,49,-170,-45,5,...,48,-37,5,30,1 ...
multi-instance format: Using the relational attribute, one only has three attributes on the first level: molecule_name , bag and class . The relational attribute contains the instances for each example, consisting of the attributes f1 to f166 . The data of the relational attribute is surrounded by quotes and the single instances inside the bag are separated by line-feeds (= \\n ). @relation musk1 @attribute molecule_name {MUSK-jf78,MUSK-jf67,MUSK-jf59,...,NON-MUSK-199} @attribute bag relational @attribute f1 numeric @attribute f2 numeric @attribute f3 numeric @attribute f4 numeric @attribute f5 numeric ... @attribute f166 numeric @end bag @attribute class {0,1} @data MUSK-188,\"42,-198,-109,-75,-117,11,23,-88,-28,-27,...,48,-37,6,30\\n42,-191,-142,-65,-117,55,49,-170,-45,5,...,48,-37,5,30\\n...\",1 ... See also # Use Weka in your Java code - general article about using the Weka API Creating an ARFF file - explains how to create an ARFF file from within Java, incl. relational attributes Links # Xin Xu. Statistical learning in multiple instance problem. Master's thesis, University of Waikato, Hamilton, NZ, 2003. 0657.594. Download MILK homepage multiInstanceLearning Javadoc multiInstanceFilters Javadoc","title":"Multi instance classification"},{"location":"multi_instance_classification/#classifiers","text":"Multi-instance classifiers were originally available through a separate software package, Multi-Instance Learning Kit (= MILK). Weka handles relational attributes now natively since 3.5.3 and the multi-instance classifiers are available through the multiInstanceLearning package and filters through the multiInstanceFilters . Once the packages have been installed, the classifiers can be found in the following package: weka.classifiers.mi","title":"Classifiers"},{"location":"multi_instance_classification/#data-format","text":"The data format for multi-instance classifiers is fairly simple: bag-id - nominal attribute; unique identifier for each bag bag - relational attribute; contains the instances of an example class - the class label for the examples Weka offers two filters to convert from flat file format (or propositional format), which is normally used in supervised classification, to multi-instance format and vice versa: weka.filters.unsupervised.attribute.PropositionalToMultiInstance weka.filters.unsupervised.attribute.MultiInstanceToPropositional Here is an example of the musk1 UCI dataset, used quite often in publications covering MI learning (Note: ... denotes omission): propositional format: This ARFF file lists all the attributes, molecule_name (which is the bag-id), f1 to f166 (containing the actual data of the instances) and the class attribute. @relation musk1 @attribute molecule_name {MUSK-jf78,MUSK-jf67,MUSK-jf59,...,NON-MUSK-199} @attribute f1 numeric @attribute f2 numeric @attribute f3 numeric @attribute f4 numeric @attribute f5 numeric ... @attribute f166 numeric @attribute class {0,1} @data MUSK-188,42,-198,-109,-75,-117,11,23,-88,-28,-27,...,48,-37,6,30,1 MUSK-188,42,-191,-142,-65,-117,55,49,-170,-45,5,...,48,-37,5,30,1 ... multi-instance format: Using the relational attribute, one only has three attributes on the first level: molecule_name , bag and class . The relational attribute contains the instances for each example, consisting of the attributes f1 to f166 . The data of the relational attribute is surrounded by quotes and the single instances inside the bag are separated by line-feeds (= \\n ). 
@relation musk1 @attribute molecule_name {MUSK-jf78,MUSK-jf67,MUSK-jf59,...,NON-MUSK-199} @attribute bag relational @attribute f1 numeric @attribute f2 numeric @attribute f3 numeric @attribute f4 numeric @attribute f5 numeric ... @attribute f166 numeric @end bag @attribute class {0,1} @data MUSK-188,\"42,-198,-109,-75,-117,11,23,-88,-28,-27,...,48,-37,6,30\\n42,-191,-142,-65,-117,55,49,-170,-45,5,...,48,-37,5,30\\n...\",1 ...","title":"Data format"},{"location":"multi_instance_classification/#see-also","text":"Use Weka in your Java code - general article about using the Weka API Creating an ARFF file - explains how to create an ARFF file from within Java, incl. relational attributes","title":"See also"},{"location":"multi_instance_classification/#links","text":"Xin Xu. Statistical learning in multiple instance problem. Master's thesis, University of Waikato, Hamilton, NZ, 2003. 0657.594. Download MILK homepage multiInstanceLearning Javadoc multiInstanceFilters Javadoc","title":"Links"},{"location":"not_so_faq/","text":"Associators # How do I use the associator GeneralizedSequentialPatterns? Classifiers # What do those numbers mean in a J48 tree?","title":"Not so FAQ"},{"location":"not_so_faq/#associators","text":"How do I use the associator GeneralizedSequentialPatterns?","title":"Associators"},{"location":"not_so_faq/#classifiers","text":"What do those numbers mean in a J48 tree?","title":"Classifiers"},{"location":"optimizing_parameters/","text":"Since finding the optimal parameters for a classifier can be a rather tedious process, Weka offers some ways of automating this process a bit. The following meta-classifiers allow you to optimize some parameters of your base classifier: weka.classifiers.meta.CVParameterSelection weka.classifiers.meta.GridSearch (only developer version) weka.classifiers.meta.MultiSearch ( external package for 3.7.11+) Auto-WEKA ( external package for 3.7.13+) After finding the best possible setup, the meta-classifiers then train an instance of the base classifier with these parameters and use it for subsequent predictions. CVParameterSelection # This meta-classifier can optimize over an arbitrary number of parameters, with only one drawback (apart from the obvious explosion of possible parameter combinations): one cannot optimize on nested options, only direct options of the base classifier. What does that mean? It means that you can optimize the C parameter of weka.classifiers.functions.SMO , but not the C of a weka.classifiers.functions.SMO within a weka.classifiers.meta.FilteredClassifier . Here are a few examples: J48 and its confidence factor (\"-C\") load your dataset in the Explorer choose weka.classifiers.meta.CVParameterSelection as classifier select weka.classifiers.trees.J48 as base classifier within CVParameterSelection open the ArrayEditor for CVParameters and enter the following string (and click on Add ): C 0.1 0.5 5 - This will test the confidence parameter from 0.1 to 0.5 with step size 0.1 (= 5 steps) close dialogs and start the classifier you will get output similar to this one, with the best parameters found in bold: Cross-validated Parameter selection.
Classifier: weka.classifiers.trees.J48 Cross-validation Parameter: '-C' ranged from 0.1 to 0.5 with 5.0 steps Classifier Options: **-C 0.1** -M 2 SMO and its complexity parameter (\"-C\") load your dataset in the Explorer choose weka.classifiers.meta.CVParameterSelection as classifier select weka.classifiers.functions.SMO as base classifier within CVParameterSelection and modify its setup if necessary, e.g., RBF kernel open the ArrayEditor for CVParameters and enter the following string (and click on Add ): C 2 8 4 This will test the complexity parameters 2, 4, 6 and 8 (= 4 steps) * close dialogs and start the classifier * you will get output similar to this one, with the best parameters found in bold: Cross-validated Parameter selection. Classifier: weka.classifiers.functions.SMO Cross-validation Parameter: '-C' ranged from 2.0 to 8.0 with 4.0 steps Classifier Options: **-C 8** -L 0.0010 -P 1.0E-12 -N 0 -V -1 -W 1 -K \"weka.classifiers.functions.supportVector.RBFKernel -C 250007 -G 0.01\" * LibSVM and the gamma parameter of the RBF kernel (\"-G\") * load your dataset in the Explorer * choose weka.classifiers.meta.CVParameterSelection as classifier * select weka.classifiers.functions.LibSVM as base classifier within CVParameterSelection and modify its setup if necessary, e.g., RBF kernel * open the ArrayEditor for CVParameters and enter the following string (and click on Add ): G 0.01 0.1 10 This will iterate over the gamma parameter, using values from 0.01 to 0.1 (= 10 steps) * close dialogs and start the classifier * you will get output similar to this one, with the best parameters found in bold: Cross-validated Parameter selection. Classifier: weka.classifiers.functions.LibSVM Cross-validation Parameter: '-G' ranged from 0.01 to 0.1 with 10.0 steps Classifier Options: **-G 0.09** -S 0 -K 2 -D 3 -R 0.0 -N 0.5 -M 40.0 -C 1.0 -E 0.0010 -P 0.1 GridSearch # weka.classifiers.meta.GridSearch is a meta-classifier for exploring 2 parameters, hence the grid in the name. If one turns the log on, the classifier will create output suitable for gnuplot , i.e., sections of the log will contain script and data sections. Instead of just using a classifier, one can specify a base classifier and a filter, both of which can be optimized (one parameter each). In contrast to CVParameterSelection , GridSearch is not limited to first-level parameters of the base classifier, since it's using Java Beans Introspection and one can specify paths to the properties one wants to optimize. A property here is the string of the parameter displayed in the GenericObjectEditor (generated through Introspection), e.g., bagSizePercent or classifier of weka.classifiers.meta.Bagging . Due to some important bugfixes, one should obtain a version of Weka >3.5.6 later than 11 Sept 2007. For each of the two axes, X and Y, one can specify the following parameters: property The dot-separated path pointing to the property to be optimized. In order to distinguish between paths for the filter or the classifier, one needs to prefix the path either with filter. or classifier. for filter or classifier path respectively. expression The mathematical expression to generate the value for the property, processed with the weka.core.MathematicalExpression class, which supports the following functions: abs , sqrt , log , exp , sin , cos , tan , rint , floor , pow , ceil . These variables are available in the expression: BASE , FROM , TO , STEP , I ; with I ranging from FROM to TO . min The minimum value to start from.
max The maximum value. step The step size used to get from min to max . base Used in pow() calculations. GridSearch can also be optimized based on the following measures: Correlation coefficient (= CC) Root mean squared error (= RMSE) Root relative squared error (= RRSE) Mean absolute error (= MAE) Root absolute error (= RAE) Combined: (1-abs(CC)) + RRSE + RAE Accuracy (= ACC) Kappa (= KAP) [only when using Weka packages] Note: Correlation coefficient is only available for numeric classes and Accuracy only for nominal ones. Here are some examples (taken from the Javadoc of the classifier): Optimizing SMO with RBFKernel (C and gamma) Start the Explorer and load your dataset with nominal class. Set the evaluation to Accuracy . Set the filter to weka.filters.AllFilter since we don't need any special data processing and we don't optimize the filter in this case (data always gets passed through the filter!). Set weka.classifiers.functions.SMO as classifier with weka.classifiers.functions.supportVector.RBFKernel as kernel. Set the XProperty to \"classifier.c\", XMin to \"1\", XMax to \"16\", XStep to \"1\" and the XExpression to \"I\". This will test the \"C\" parameter of SMO for the values from 1 to 16. Set the YProperty to \"classifier.kernel.gamma\", YMin to \"-5\", YMax to \"2\", YStep to \"1\", YBase to \"10\" and YExpression to \"pow(BASE,I)\". This will test the gamma of the RBFKernel with the values 10^-5, 10^-4, .., 10^2. Output will be similar to this one: Filter: weka.filters.AllFilter Classifier: weka.classifiers.functions.SMO -C 2.0 -L 0.0010 -P 1.0E-12 -N 0 -V -1 -W 1 -K \"weka.classifiers.functions.supportVector.RBFKernel -C 250007 -G 0.0\" X property: classifier.c Y property: classifier.kernel.gamma Evaluation: Accuracy Coordinates: [2.0, 0.0] Values: **2.0** (X coordinate), **1.0** (Y coordinate) * Optimizing PLSFilter with LinearRegression (# of components and ridge) - default setup * Start the Explorer and load your dataset with numeric class. * Set the evaluation to Correlation coefficient. * Set the filter to weka.filters.supervised.attribute.PLSFilter . * Set weka.classifiers.functions.LinearRegression as classifier and use no attribute selection and no elimination of collinear attributes (speeds up LinearRegression significantly!). * Set the XProperty to \"filter.numComponents\", XMin to \"5\", XMax to \"20\" (this depends heavily on your dataset, should be no more than the number of attributes!), XStep to \"1\" and XExpression to \"I\". This will test the number of components the PLSFilter will produce from 5 to 20. * Set the YProperty to \"classifier.ridge\", YMin to \"-10\", YMax to \"5\", YStep to \"1\" and YExpression to \"pow(BASE,I)\". This will try ridge parameters from 10^-10 to 10^5.
E.g., using a weka.filters.MultiFilter in GridSearch consisting of a ReplaceMissingValues and a PLSFilter filter one can address the numComponents property of the PLSFilter with filter.filter[1].numComponents MultiSearch # weka.classifiers.meta.MultiSearch is available through this Weka package (requires Weka 3.7.11 or later; for downloads see the Releases section). MultiSearch is similar to GridSearch, more general and simpler at the same time. More general, because it allows the optimization of an arbitrary number of parameters, not just two. Simpler, because it does not offer any search space expansions or gnuplot output and less options. For each parameter to optimize, the user has to define a search parameter . There are two types of parameters available: MathParameter - basically what GridSearch uses, with an expression to calculate the actual value using the min, max and step parameters ListParameter - the blank-separated list of values is used as input for the optimization (useful, if values cannot be described by a mathematical function) Here is a setup for finding the best ridge parameter (property classifier.ridge ) using the MathParameter search parameter using values from 10^-10 to 10^5: weka.classifiers.meta.MultiSearch \\ -E CC \\ -search \"weka.core.setupgenerator.MathParameter -property classifier.ridge -min -10.0 -max 5.0 -step 1.0 -base 10.0 -expression pow(BASE,I)\" \\ -sample-size 100.0 -initial-folds 2 -subsequent-folds 10 -num-slots 1 -S 1 \\ -W weka.classifiers.functions.LinearRegression -- -S 1 -C -R 1.0E-8 And here using the ListParameter search parameter for evaluating values 0.001, 0.05, 0.1, 0.5, 0.75 and 1.0 for the ridge parameter (property classifier.ridge ): weka.classifiers.meta.MultiSearch \\ -E CC \\ -search \"weka.core.setupgenerator.ListParameter -property classifier.ridge -list \\\"0.001 0.05 0.1 0.5 0.75 1.0\\\"\" \\ -sample-size 100.0 -initial-folds 2 -subsequent-folds 10 -num-slots 1 -S 1 \\ -W weka.classifiers.functions.LinearRegression -- -S 1 -C -R 1.0E-8 MultiSearch can be optimized based on the following measures: Correlation coefficient (= CC) Root mean squared error (= RMSE) Root relative squared error (= RRSE) Mean absolute error (= MAE) Root absolute error (= RAE) Combined: (1-abs(CC)) + RRSE + RAE Accuracy (= ACC) Kappa (= KAP) Auto-WEKA # Auto-WEKA is available as a package through the WEKA package manager. It provides the class weka.classifiers.meta.AutoWEKAClassifier and optimizes all parameters of all learners. It also automatically determines the best learner to use and the best attribute selection method for a given dataset. More information is available on the project website and the manual . Downloads # CVParam.java - optimizes J48's -C parameter See also # LibSVM - you need additional jars in your CLASSPATH to be able to use LibSVM Links # gnuplot homepage Java Beans Introspection","title":"Optimizing parameters"},{"location":"optimizing_parameters/#cvparameterselection","text":"This meta-classifier can optimize over an arbitrary number of parameters, with only one drawback (apart from the obvious explosion of possible parameter combinations): one cannot optimize on nested options, only direct options of the base classifier. What does that mean? It means, that you can optimize the C parameter of weka.classifiers.functions.SMO , but not the C of an weka.classifiers.functions.SMO within a weka.classifiers.meta.FilteredClassifier . 
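The same kind of search can also be set up via the Weka API; the following is a minimal sketch (dataset.arff is a placeholder and the class attribute is assumed to be the last one), using the same parameter string format as the Explorer walkthroughs that follow:

```java
import weka.classifiers.meta.CVParameterSelection;
import weka.classifiers.trees.J48;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class CVParamSearch {
  public static void main(String[] args) throws Exception {
    Instances data = DataSource.read("dataset.arff");  // placeholder file name
    data.setClassIndex(data.numAttributes() - 1);
    CVParameterSelection ps = new CVParameterSelection();
    ps.setClassifier(new J48());
    // test J48's confidence factor -C from 0.1 to 0.5 in 5 steps
    ps.addCVParameter("C 0.1 0.5 5");
    ps.buildClassifier(data);  // runs the cross-validated parameter search
    System.out.println(ps);    // prints the best parameters found
  }
}
```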
Here are a few examples: J48 and it's confidence interval (\"-C\") load your dataset in the Explorer choose weka.classifiers.meta.CVParameterSelection as classifier select weka.classifiers.trees.J48 as base classifier within CVParameterSelection open the ArrayEditor for CVParameters and enter the following string (and click on Add ): C 0.1 0.5 5 - This will test the confidence parameter from 0.1 to 0.5 with step size 0.1 (= 5 steps) close dialogs and start the classifier you will get output similar to this one, with the best parameters found in bold: Cross-validated Parameter selection. Classifier: weka.classifiers.trees.J48 Cross-validation Parameter: '-C' ranged from 0.1 to 0.5 with 5.0 steps Classifier Options: **-C 0.1** -M 2 SMO and it's complexity parameter (\"-C\") load your dataset in the Explorer choose weka.classifiers.meta.CVParameterSelection as classifier select weka.classifiers.functions.SMO as base classifier within CVParameterSelection and modify its setup if necessary, e.g., RBF kernel open the ArrayEditor for CVParameters and enter the following string (and click on Add ): C 2 8 4 This will test the complexity parameters 2, 4, 6 and 8 (= 4 steps) * close dialogs and start the classifier * you will get output similar to this one, with the best parameters found in bold: Cross-validated Parameter selection. Classifier: weka.classifiers.functions.SMO Cross-validation Parameter: '-C' ranged from 2.0 to 8.0 with 4.0 steps Classifier Options: **-C 8** -L 0.0010 -P 1.0E-12 -N 0 -V -1 -W 1 -K \"weka.classifiers.functions.supportVector.RBFKernel -C 250007 -G 0.01\" * LibSVM and the gamma parameter of the RBF kernel (\"-G\") * load your dataset in the Explorer * choose weka.classifiers.meta.CVParameterSelection as classifier * select [weka.classifiers.functions.LibSVM](lib_svm.md) as base classifier within CVParameterSelection and modify its setup if necessary, e.g., RBF kernel * open the ArrayEditor for CVParameters and enter the following string (and click on Add ): G 0.01 0.1 10 This will iterate over the gamma parameter, using values from 0.01 to 0.1 (= 10 steps) * close dialogs and start the classifier * you will get output similar to this one, with the best parameters found in bold: Cross-validated Parameter selection. Classifier: weka.classifiers.functions.LibSVM Cross-validation Parameter: '-G' ranged from 0.01 to 0.1 with 10.0 steps Classifier Options: **-G 0.09** -S 0 -K 2 -D 3 -R 0.0 -N 0.5 -M 40.0 -C 1.0 -E 0.0010 -P 0.1","title":"CVParameterSelection"},{"location":"optimizing_parameters/#gridsearch","text":"weka.classifiers.meta.GridSearch is a meta-classifier for exploring 2 parameters, hence the grid in the name. If one turns the log on, the classifier will create output suitable for gnuplot , i.e., sections of the log will contain script and data sections. Instead of just using a classifier, one can specify a base classifier and a filter, which both of them can be optimized (one parameter each). In contrast to CVParameterSelection , GridSearch is not limited to first-level parameters of the base classifier, since it's using Java Beans Introspection and one can specify paths to the properties one wants to optimize. A property here is the string of the parameter displayed in the GenericObjectEditor (generated though Introspection), e.g., bagSizePercent or classifier of weka.classifiers.meta.Bagging . Due to some important bugfixes, one should obtain a version of Weka >3.5.6 later than 11 Sept 2007. 
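For readers who prefer the API over the Explorer, here is a rough sketch mirroring the SMO/RBFKernel example below; it assumes the setter names match the GenericObjectEditor property names described next, and dataset.arff is a placeholder:

```java
import weka.classifiers.functions.SMO;
import weka.classifiers.functions.supportVector.RBFKernel;
import weka.classifiers.meta.GridSearch;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.AllFilter;

public class GridSearchSketch {
  public static void main(String[] args) throws Exception {
    Instances data = DataSource.read("dataset.arff");  // placeholder file name
    data.setClassIndex(data.numAttributes() - 1);
    GridSearch gs = new GridSearch();
    gs.setFilter(new AllFilter());  // no special data processing
    SMO smo = new SMO();
    smo.setKernel(new RBFKernel());
    gs.setClassifier(smo);
    // X axis: SMO's complexity constant C, tested from 1 to 16
    gs.setXProperty("classifier.c");
    gs.setXMin(1); gs.setXMax(16); gs.setXStep(1);
    gs.setXExpression("I");
    // Y axis: gamma of the RBF kernel, tested from 10^-5 to 10^2
    gs.setYProperty("classifier.kernel.gamma");
    gs.setYMin(-5); gs.setYMax(2); gs.setYStep(1);
    gs.setYBase(10); gs.setYExpression("pow(BASE,I)");
    gs.buildClassifier(data);  // performs the grid search
    System.out.println(gs);
  }
}
```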
For each of the two axes, X and Y, one can specify the following parameters: property The dot-separated path pointing to the property to be optimized. In order to distinguish between paths for the filter or the classifier, one needs to prefix the path either with filter. or classifier. for filter or classifier path respectively. expression The mathematical expression to generate the value for the property, processed with the weka.core.MathematicalExpression class, which supports the following functions: abs , sqrt , log , exp , sin , cos , tan , rint , floor , pow , ceil . These variables are available in the expression: BASE , FROM , TO , STEP , I ; with I ranging from FROM to TO . min The minimum value to start from. max The maximum value. step The step size used to get from min to max . base Used in pow() calculations. GridSearch can also optimized based on the following measures: Correlation coefficient (= CC) Root mean squared error (= RMSE) Root relative squared error (= RRSE) Mean absolute error (= MAE) Root absolute error (= RAE) Combined: (1-abs(CC)) + RRSE + RAE Accuracy (= ACC) Kappa (= KAP) [only when using Weka packages] Note: Correlation coefficient is only available for numeric classes and Accuracy only for nominal ones. Here are a some examples (taken from the Javadoc of the classifier): Optimizing SMO with RBFKernel (C and gamma) Start the Explorer and load your dataset with nominal class. Set the evaluation to Accuracy . Set the filter to weka.filters.AllFilter since we don't need any special data processing and we don't optimize the filter in this case (data gets always passed through filter!). Set weka.classifiers.functions.SMO as classifier with weka.classifiers.functions.supportVector.RBFKernel as kernel. Set the XProperty to \"classifier.c\", XMin to \"1\", XMax to \"16\", XStep to \"1\" and the XExpression to \"I\". This will test the \"C\" parameter of SMO for the values from 1 to 16. Set the YProperty to \"classifier.kernel.gamma\", YMin to \"-5\", YMax to \"2\", YStep to \"1\", YBase to \"10\" and YExpression to \"pow(BASE,I)\". This will test the gamma of the RBFKernel with the values 10 -5 , 10 -4 ,..,10 2 . Output will be similar to this one here: Filter: weka.filters.AllFilter Classifier: weka.classifiers.functions.SMO -C 2.0 -L 0.0010 -P 1.0E-12 -N 0 -V -1 -W 1 -K \"weka.classifiers.functions.supportVector.RBFKernel -C 250007 -G 0.0\" X property: classifier.c Y property: classifier.kernel.gamma Evaluation: Accuracy Coordinates: [2.0, 0.0] Values: **2.0** (X coordinate), **1.0** (Y coordinate) * Optimizing PLSFilter with LinearRegression (# of components and ridge) - default setup * Start the Explorer and load your dataset with numeric class. * Set the evaluation to Correlation coefficient. * Set the filter to weka.filters.supervised.attribute.PLSFilter . * Set weka.classifiers.functions.LinearRegression as classifier and use no attribute selection and no elimination of colinear attributes (speeds up LinearRegression significantly!). * Set the XProperty to \"filter.numComponents\", XMin to \"5\", XMax to \"20\" (this depends heavily on your dataset, should be no more than the number of attributes!), XStep to \"1\" and XExpression to \"I\". This will test the number of components the PLSFilter will produce from 5 to 20. * Set the YProperty to \"classifier.ridge\", XMin to \"-10\", XMax to \"5\", YStep to \"1\" and YExpression to \"pow(BASE,I)\". This will try ridge parameters from 10 -10 to 10 5 . 
* Output will be similar to this one: Filter: weka.filters.supervised.attribute.PLSFilter -C 5 -M -A PLS1 -P center Classifier: weka.classifiers.functions.LinearRegression -S 1 -C -R 5.0 X property: filter.numComponents Y property: classifier.ridge Evaluation: Correlation coefficient Coordinates: [5.0, 5.0] Values: **5.0** (X coordinate), **100000.0** (Y coordinate) Notes: a property for the classifier starts with classifier. a property for the filter starts with filter. Arrays of objects are addressed with [ ] , with the index being 0-based. E.g., using a weka.filters.MultiFilter in GridSearch consisting of a ReplaceMissingValues and a PLSFilter filter one can address the numComponents property of the PLSFilter with filter.filter[1].numComponents","title":"GridSearch"},{"location":"optimizing_parameters/#multisearch","text":"weka.classifiers.meta.MultiSearch is available through this Weka package (requires Weka 3.7.11 or later; for downloads see the Releases section). MultiSearch is similar to GridSearch, more general and simpler at the same time. More general, because it allows the optimization of an arbitrary number of parameters, not just two. Simpler, because it does not offer any search space expansions or gnuplot output and less options. For each parameter to optimize, the user has to define a search parameter . There are two types of parameters available: MathParameter - basically what GridSearch uses, with an expression to calculate the actual value using the min, max and step parameters ListParameter - the blank-separated list of values is used as input for the optimization (useful, if values cannot be described by a mathematical function) Here is a setup for finding the best ridge parameter (property classifier.ridge ) using the MathParameter search parameter using values from 10^-10 to 10^5: weka.classifiers.meta.MultiSearch \\ -E CC \\ -search \"weka.core.setupgenerator.MathParameter -property classifier.ridge -min -10.0 -max 5.0 -step 1.0 -base 10.0 -expression pow(BASE,I)\" \\ -sample-size 100.0 -initial-folds 2 -subsequent-folds 10 -num-slots 1 -S 1 \\ -W weka.classifiers.functions.LinearRegression -- -S 1 -C -R 1.0E-8 And here using the ListParameter search parameter for evaluating values 0.001, 0.05, 0.1, 0.5, 0.75 and 1.0 for the ridge parameter (property classifier.ridge ): weka.classifiers.meta.MultiSearch \\ -E CC \\ -search \"weka.core.setupgenerator.ListParameter -property classifier.ridge -list \\\"0.001 0.05 0.1 0.5 0.75 1.0\\\"\" \\ -sample-size 100.0 -initial-folds 2 -subsequent-folds 10 -num-slots 1 -S 1 \\ -W weka.classifiers.functions.LinearRegression -- -S 1 -C -R 1.0E-8 MultiSearch can be optimized based on the following measures: Correlation coefficient (= CC) Root mean squared error (= RMSE) Root relative squared error (= RRSE) Mean absolute error (= MAE) Root absolute error (= RAE) Combined: (1-abs(CC)) + RRSE + RAE Accuracy (= ACC) Kappa (= KAP)","title":"MultiSearch"},{"location":"optimizing_parameters/#auto-weka","text":"Auto-WEKA is available as a package through the WEKA package manager. It provides the class weka.classifiers.meta.AutoWEKAClassifier and optimizes all parameters of all learners. It also automatically determines the best learner to use and the best attribute selection method for a given dataset. 
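Since AutoWEKAClassifier implements the standard classifier interface, it can be run from the command line like any other classifier via the usual -t option (data.arff is a placeholder; Auto-WEKA-specific options, such as its time limit, are covered in the manual): java weka.classifiers.meta.AutoWEKAClassifier -t data.arff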
More information is available on the project website and the manual .","title":"Auto-WEKA"},{"location":"optimizing_parameters/#downloads","text":"CVParam.java - optimizes J48's -C parameter","title":"Downloads"},{"location":"optimizing_parameters/#see-also","text":"LibSVM - you need additional jars in your CLASSPATH to be able to use LibSVM","title":"See also"},{"location":"optimizing_parameters/#links","text":"gnuplot homepage Java Beans Introspection","title":"Links"},{"location":"osx_mountain_lion_weka_x_y_z_is_damaged_and_cant_be_installed_you_should_eject_the_disk_image/","text":"Mac OS X 10.8 (Mountain Lion) introduced a new security feature that, by default, limits \"acceptable\" applications to only those downloaded from the Mac App store. Thankfully, you can alter this in the system preferences. Go to \"Security & Privacy\" and change the \"Allow applications downloaded from:\" to \"Anywhere\". Weka will launch successfully after this change.","title":"Osx mountain lion weka x y z is damaged and cant be installed you should eject the disk image"},{"location":"performing_attribute_selection/","text":"In Weka, you have three options of performing attribute selection from commandline (not everything is possible from the GUI): the native approach, using the attribute selection classes directly using a meta-classifier the filter approach Notes: The commandlines outlined in this article are for a Linux/Unix bash (the backslash tells the shell that the command isn't finished yet and continues on the next line). In case of Windows or the SimpleCLI, just remove those backslashes and put everything on one line. The Explorer in the developer version (>= 3.5.4) also outputs the commandline setups to its log. Just click on the Log button to display the log and copy/paste the commandlines (you will need to add the appropriate java call and dataset files, of course). Native # Using the attribute selection classes directly outputs some additional useful information, like number of subsets evaluated/best merit (for subset evaluators), ranked output with merit per attribute (for ranking based setups). The attribute selection classes are located in the following package: weka.attributeSelection Example using CfsSubsetEval and BestFirst : java weka.attributeSelection.CfsSubsetEval \\ -M \\ -s \"weka.attributeSelection.BestFirst -D 1 -N 5\" \\ -i Meta-classifier # Weka also offers a meta-classifier that takes a search algorithm and evaluator next to the base classifier. This makes the attribute selection process completely transparent and the base classifier receives only the reduced dataset. This is the full classname of the meta-classifier: weka.classifiers.meta.AttributeSelectedClassifier Example using CfsSubsetEval and BestFirst : java weka.classifiers.meta.AttributeSelectedClassifier \\ -t \\ -E \"weka.attributeSelection.CfsSubsetEval -M\" \\ -S \"weka.attributeSelection.BestFirst -D 1 -N 5\" \\ -W weka.classifiers.trees.J48 \\ -- \\ -C 0 .25 -M 2 Filter # In case you want to obtain the reduced/ranked data and not just output the selected/ranked attributes or using it internally in a classifier, you can use the filter approach. 
The following filter offers attribute selection: weka.filters.supervised.attribute.AttributeSelection Example using CfsSubsetEval and BestFirst in batch mode : java weka.filters.supervised.attribute.AttributeSelection \\ -E \"weka.attributeSelection.CfsSubsetEval -M\" \\ -S \"weka.attributeSelection.BestFirst -D 1 -N 5\" \\ -b \\ -i \\ -o \\ -r \\ -s Note: batch mode is not available from the Explorer. See also # Batch filtering - general information about batch filtering Use Weka in your Java code , section Attribute selection - if you want to use attribute selection from your own code.","title":"Performing attribute selection"},{"location":"performing_attribute_selection/#native","text":"Using the attribute selection classes directly outputs some additional useful information, like number of subsets evaluated/best merit (for subset evaluators), ranked output with merit per attribute (for ranking based setups). The attribute selection classes are located in the following package: weka.attributeSelection Example using CfsSubsetEval and BestFirst : java weka.attributeSelection.CfsSubsetEval \\ -M \\ -s \"weka.attributeSelection.BestFirst -D 1 -N 5\" \\ -i ","title":"Native"},{"location":"performing_attribute_selection/#meta-classifier","text":"Weka also offers a meta-classifier that takes a search algorithm and evaluator next to the base classifier. This makes the attribute selection process completely transparent and the base classifier receives only the reduced dataset. This is the full classname of the meta-classifier: weka.classifiers.meta.AttributeSelectedClassifier Example using CfsSubsetEval and BestFirst : java weka.classifiers.meta.AttributeSelectedClassifier \\ -t \\ -E \"weka.attributeSelection.CfsSubsetEval -M\" \\ -S \"weka.attributeSelection.BestFirst -D 1 -N 5\" \\ -W weka.classifiers.trees.J48 \\ -- \\ -C 0 .25 -M 2","title":"Meta-classifier"},{"location":"performing_attribute_selection/#filter","text":"In case you want to obtain the reduced/ranked data and not just output the selected/ranked attributes or using it internally in a classifier, you can use the filter approach. The following filter offers attribute selection: weka.filters.supervised.attribute.AttributeSelection Example using CfsSubsetEval and BestFirst in batch mode : java weka.filters.supervised.attribute.AttributeSelection \\ -E \"weka.attributeSelection.CfsSubsetEval -M\" \\ -S \"weka.attributeSelection.BestFirst -D 1 -N 5\" \\ -b \\ -i \\ -o \\ -r \\ -s Note: batch mode is not available from the Explorer.","title":"Filter"},{"location":"performing_attribute_selection/#see-also","text":"Batch filtering - general information about batch filtering Use Weka in your Java code , section Attribute selection - if you want to use attribute selection from your own code.","title":"See also"},{"location":"plotting_multiple_roc_curves/","text":"KnowledgeFlow # Description # Comparing different classifiers on one dataset can also be done via ROC curves , not just via Accuracy, Correlation coefficient etc. In the Explorer it is not possible to do that for several classifiers, this is only possible in the KnowledgeFlow . This is the basic setup (based on a Wekalist post): ArffLoader ---dataSet---> ClassAssigner ---dataSet---> ClassValuePicker (the class label you want the plot for) ---dataSet---> CrossValidationFoldMaker ---trainingSet/testSet (i.e. 
BOTH connections)---> Classifier of your choice ---batchClassifier---> ClassifierPerformanceEvaluator ---thresholdData---> ModelPerformanceChart This setup can be easily extended to host several classifiers, which illustrates the Plotting_multiple_roc.kfml example, containing J48 and RandomForest as classifiers. Java # Description # The VisualizeMultipleROC.java class lets you display several ROC curves in a single plot. The data it is using for display is from previously saved ROC curves. This example class is just a modified version of the VisualizeROC.java class, which displays only a single ROC curve (see Visualizing ROC curve article). See also # Wikipedia article on ROC curve Visualizing ROC curve ROC curves Downloads # Plotting_multiple_roc.kfml - Example KnowledgeFlow layout file VisualizeMultipleROC.java ( stable , developer )","title":"KnowledgeFlow"},{"location":"plotting_multiple_roc_curves/#knowledgeflow","text":"","title":"KnowledgeFlow"},{"location":"plotting_multiple_roc_curves/#description","text":"Comparing different classifiers on one dataset can also be done via ROC curves , not just via Accuracy, Correlation coefficient etc. In the Explorer it is not possible to do that for several classifiers, this is only possible in the KnowledgeFlow . This is the basic setup (based on a Wekalist post): ArffLoader ---dataSet---> ClassAssigner ---dataSet---> ClassValuePicker (the class label you want the plot for) ---dataSet---> CrossValidationFoldMaker ---trainingSet/testSet (i.e. BOTH connections)---> Classifier of your choice ---batchClassifier---> ClassifierPerformanceEvaluator ---thresholdData---> ModelPerformanceChart This setup can be easily extended to host several classifiers, which illustrates the Plotting_multiple_roc.kfml example, containing J48 and RandomForest as classifiers.","title":"Description"},{"location":"plotting_multiple_roc_curves/#java","text":"","title":"Java"},{"location":"plotting_multiple_roc_curves/#description_1","text":"The VisualizeMultipleROC.java class lets you display several ROC curves in a single plot. The data it is using for display is from previously saved ROC curves. This example class is just a modified version of the VisualizeROC.java class, which displays only a single ROC curve (see Visualizing ROC curve article).","title":"Description"},{"location":"plotting_multiple_roc_curves/#see-also","text":"Wikipedia article on ROC curve Visualizing ROC curve ROC curves","title":"See also"},{"location":"plotting_multiple_roc_curves/#downloads","text":"Plotting_multiple_roc.kfml - Example KnowledgeFlow layout file VisualizeMultipleROC.java ( stable , developer )","title":"Downloads"},{"location":"primer/","text":"WEKA is a comprehensive workbench for machine learning and data mining. Its main strengths lie in the classification area, where many of the main machine learning approaches have been implemented within a clean, object-oriented Java class hierarchy. Regression, association rule mining, time series prediction, and clustering algorithms have also been implemented. This document serves as a brief introduction to using WEKA from the command line interface. We will begin by describing basic concepts and ideas. Then, we will describe the weka.filters package, which is used to transform input data, e.g., for preprocessing, transformation, feature generation and so on. Following that, we will consider some machine learning algorithms that generate classification models. Afterwards, some practical examples are given. 
Note that, in the doc directory of the WEKA installation directory, you can find documentation of all Java classes in WEKA. Be prepared to use it, since this introduction is not intended to be complete. If you want to know exactly what is going on, take a look at the source code, which can be found in weka-src.jar and can be extracted via the jar utility from the Java Development Kit. Basic concepts # Dataset # A set of data items, the dataset, is a very basic concept of machine learning. A dataset is roughly equivalent to a two-dimensional spreadsheet or database table. In WEKA, it is implemented by the Instances class. A dataset is a collection of examples, each one of class Instance . Each Instance consists of a number of attributes, any of which can be nominal (= one of a predefined list of values), numeric (= a real or integer number) or a string (= an arbitrarily long list of characters, enclosed in \"double quotes\"). WEKA also supports date attributes and relational attributes. The external representation of an Instances class is an ARFF file, which consists of a header describing the attribute types and the data as a comma-separated list. Here is a short, commented example. A complete description of the ARFF file format can be found here . % This is a toy example, the UCI weather dataset. % Any relation to real weather is purely coincidental. Comment lines at the beginning of the dataset should give an indication of its source, context and meaning. @relation golfWeatherMichigan_1988/02/10_14days Here we state the internal name of the dataset. Try to be as descriptive as possible. @attribute outlook {sunny, overcast, rainy} @attribute windy {TRUE, FALSE} Here we define two nominal attributes, outlook and windy . The former has three values: sunny , overcast and rainy ; the latter two: TRUE and FALSE . Nominal values with special characters, commas or spaces are enclosed in 'single quotes'. @attribute temperature numeric @attribute humidity numeric These lines define two numeric attributes. @attribute play {yes, no} The last attribute is the default target or class variable used for prediction. In our case it is a nominal attribute with two values, making this a binary classification problem. @data sunny,FALSE,85,85,no sunny,TRUE,80,90,no overcast,FALSE,83,86,yes rainy,FALSE,70,96,yes rainy,FALSE,68,80,yes The rest of the dataset consists of the token @data, followed by comma-separated values for the attributes -- one line per example. In our case there are five examples. Some basic statistics and validation of given ARFF files can be obtained via the main() routine of weka.core.Instances : java weka.core.Instances data/soybean.arff weka.core offers some other useful routines, e.g., converters.C45Loader and converters.CSVLoader , which can be used to convert C45 datasets and comma/tab-separated datasets respectively, e.g.: java weka.core.converters.CSVLoader data.csv > data.arff java weka.core.converters.C45Loader c45_filestem > data.arff Classifier # Any classification or regression algorithm in WEKA is derived from the abstract Classifier class. Surprisingly little is needed for a basic classifier: a routine which generates a classifier model from a training dataset (= buildClassifier ) and another routine which produces a classification for a given instance (= classifyInstance ), or generates a probability distribution for all classes of the instance (= distributionForInstance ). A classifier model is an arbitrarily complex mapping from predictor attributes to the class attribute.
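To make the classifier contract above concrete, here is a minimal, hypothetical sketch of the basic train/predict cycle (the class name TrainAndPredict is made up for illustration; it assumes the weather data shipped with the Weka distribution):

```java
import weka.classifiers.Classifier;
import weka.classifiers.trees.J48;
import weka.core.Instances;
import java.io.BufferedReader;
import java.io.FileReader;

public class TrainAndPredict {
  public static void main(String[] args) throws Exception {
    // load the dataset and declare the last attribute to be the class
    Instances data = new Instances(
        new BufferedReader(new FileReader("data/weather.numeric.arff")));
    data.setClassIndex(data.numAttributes() - 1);
    // buildClassifier generates the model from the training data
    Classifier cls = new J48();
    cls.buildClassifier(data);
    // classifyInstance returns the index of the predicted class value;
    // distributionForInstance returns one probability per class
    double label = cls.classifyInstance(data.instance(0));
    double[] dist = cls.distributionForInstance(data.instance(0));
    System.out.println("predicted: " + data.classAttribute().value((int) label));
    for (int i = 0; i < dist.length; i++)
      System.out.println(data.classAttribute().value(i) + ": " + dist[i]);
  }
}
```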
The specific form and creation of this mapping, or model, differs from classifier to classifier. For example, ZeroR's model just consists of a single value: the most common class in the case of classification problems, or the median of all numeric values in case of predicting a numeric value (= regression learning). ZeroR is a trivial classifier, but it gives a lower bound on the performance achievable on a given dataset that should be significantly improved upon by more complex classifiers. As such it is a reasonable test of how well the class can be predicted without considering the other attributes. Later , we will explain how to interpret the output from classifiers in detail -- for now just focus on the Correctly Classified Instances in the section Stratified cross-validation and notice how it improves from ZeroR to J48 when we use the soybean data: java weka.classifiers.rules.ZeroR -t soybean.arff java weka.classifiers.trees.J48 -t soybean.arff There are various approaches to determine the performance of classifiers. It can most simply be measured by counting the proportion of correctly predicted examples in a test dataset. This value is the classification accuracy , which is also 1-ErrorRate . Both terms are used in literature. The simplest case for evaluation is when we use a training set and a test set which are mutually independent. This is referred to as a hold-out estimate. To estimate variance in these performance estimates, hold-out estimates may be computed repeatedly by resampling the same dataset -- i.e., randomly shuffling it and then splitting it into training and test sets with a specific proportion of the examples, collecting all estimates on the test sets and computing average and standard deviation of accuracy. A more elaborate method is k -fold cross-validation. Here, a number of folds k is specified. The dataset is randomly shuffled and then split into k folds of equal size. In each iteration, one fold is used for testing and the other k-1 folds are used for training the classifier. The test results are collected and pooled (or averaged) over all folds. This gives the cross-validation estimate of accuracy. The folds can be purely random or slightly modified to create the same class distributions in each fold as in the complete dataset. In the latter case the cross-validation is called stratified . Leave-one-out (loo) cross-validation signifies that k is equal to the number of examples. Out of necessity, loo CV has to be non-stratified, i.e., the class distributions in the test sets are not the same as those in the training data. Therefore loo CV can produce misleading results in rare cases. However it is still quite useful in dealing with small datasets since it utilizes the greatest amount of training data from the dataset. weka filters # The weka.filters package contains Java classes that transform datasets -- by removing or adding attributes, resampling the dataset, removing examples and so on. This package offers useful support for data preprocessing, which is an important step in machine learning. All filters offer the command-line option -i for specifying the input dataset, and the option -o for specifying the output dataset. If either of these parameters is not given, standard input or output is used instead, which allows the filters to be used within pipes. Other parameters are specific to each filter and can be found out via -h , as with any other class.
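The filters can also be applied programmatically instead of via -i and -o ; the following minimal sketch (the class name FilterExample is made up for illustration) shows the general pattern, using the Remove filter described further below:

```java
import weka.core.Instances;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.Remove;
import java.io.BufferedReader;
import java.io.FileReader;

public class FilterExample {
  public static void main(String[] args) throws Exception {
    Instances input = new Instances(
        new BufferedReader(new FileReader("data/iris.arff")));
    // delete the first two attributes
    Remove filter = new Remove();
    filter.setAttributeIndices("1-2");
    // setInputFormat must be called before Filter.useFilter
    filter.setInputFormat(input);
    Instances output = Filter.useFilter(input, filter);
    System.out.println(output.toSummaryString());
  }
}
```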
The weka.filters package is organized into supervised and unsupervised filtering, both of which are again subdivided into instance and attribute filtering. We will discuss each of the four subsections separately. weka.filters.supervised # Classes below weka.filters.supervised in WEKA's Java class hierarchy are for supervised filtering, i.e., taking advantage of the class information. For those filters, a class must be assigned by providing the index of the class attribute via -c . attribute # Discretize is used to discretize numeric attributes into nominal ones, based on the class information, via Fayyad & Irani's MDL method, or optionally with Kononenko's MDL method. Some learning schemes or classifiers can only process nominal data, e.g., rules.Prism ; and in some cases discretization may also reduce learning time and help combat overfitting. java weka.filters.supervised.attribute.Discretize -i data/iris.arff -o iris-nom.arff -c last java weka.filters.supervised.attribute.Discretize -i data/cpu.arff -o cpu-classvendor-nom.arff -c first NominalToBinary encodes all nominal attributes into binary (two-valued) attributes, which can be used to transform the dataset into a purely numeric representation, e.g., for visualization via multi-dimensional scaling. java weka.filters.supervised.attribute.NominalToBinary -i data/contact-lenses.arff -o contact-lenses-bin.arff -c last Note that most classifiers in WEKA utilize transformation filters internally, e.g., Logistic and SMO, so you may not have to use these filters explicitly. instance # Resample creates a stratified subsample of the given dataset. This means that overall class distributions are approximately retained within the sample. A bias towards uniform class distribution can be specified via -B . java weka.filters.supervised.instance.Resample -i data/soybean.arff -o soybean-5%.arff -c last -Z 5 java weka.filters.supervised.instance.Resample -i data/soybean.arff -o soybean-uniform-5%.arff -c last -Z 5 -B 1 StratifiedRemoveFolds creates stratified cross-validation folds of the given dataset. This means that by default the class distributions are approximately retained within each fold. The following example splits soybean.arff into stratified training and test datasets, the latter consisting of 25% (=1/4) of the data. java weka.filters.supervised.instance.StratifiedRemoveFolds -i data/soybean.arff -o soybean-train.arff \\ -c last -N 4 -F 1 -V java weka.filters.supervised.instance.StratifiedRemoveFolds -i data/soybean.arff -o soybean-test.arff \\ -c last -N 4 -F 1 weka.filters.unsupervised # Classes below weka.filters.unsupervised in WEKA's Java class hierarchy are for unsupervised filtering, e.g., the non-stratified version of Resample. A class should not be assigned here. attribute # StringToWordVector transforms string attributes into word vectors, e.g., creating one attribute for each word that either encodes presence or word count ( -C ) within the string. -W can be used to set an approximate limit on the number of words. When a class is assigned, the limit applies to each class separately. This filter is useful for text mining. Obfuscate renames the dataset name, all attribute names and nominal attribute values. This is intended for exchanging sensitive datasets without giving away restricted information. Remove is intended for explicit deletion of attributes from a dataset, e.g.
for removing attributes of the iris dataset: java weka.filters.unsupervised.attribute.Remove -R 1-2 -i data/iris.arff -o iris-simplified.arff java weka.filters.unsupervised.attribute.Remove -V -R 3-last -i data/iris.arff -o iris-simplified.arff instance # Resample creates a non-stratified subsample of the given dataset. It performs random sampling without regard to the class information. Otherwise it is equivalent to its supervised variant. java weka.filters.unsupervised.instance.Resample -i data/soybean.arff -o soybean-5%.arff -Z 5 RemoveFolds creates cross-validation folds of the given dataset. The class distributions are not retained. The following example splits soybean.arff into training and test datasets, the latter consisting of 25% (=1/4) of the data. java weka.filters.unsupervised.instance.RemoveFolds -i data/soybean.arff -o soybean-train.arff -c last -N 4 -F 1 -V java weka.filters.unsupervised.instance.RemoveFolds -i data/soybean.arff -o soybean-test.arff -c last -N 4 -F 1 RemoveWithValues filters instances according to the value of an attribute. java weka.filters.unsupervised.instance.RemoveWithValues -i data/soybean.arff \\ -o soybean-without_herbicide_injury.arff -V -C last -L 19 weka.classifiers # Classifiers are at the core of WEKA. There are a lot of common options for classifiers, most of which are related to evaluation purposes. We will focus on the most important ones. All others including classifier-specific parameters can be found via -h , as usual. Parameter Description -t specifies the training file (ARFF format) -T specifies the test file (ARFF format). If this parameter is missing, a cross-validation will be performed (default: 10-fold cv) -x This parameter determines the number of folds for the cross-validation. A cv will only be performed if -T is missing. -c As we already know from the weka.filters section, this parameter sets the class variable with a one-based index. -d The model after training can be saved via this parameter. Each classifier has a different binary format for the model, so it can only be read back by the same classifier on a compatible dataset. Only the model on the training set is saved, not the multiple models generated via cross-validation. -l Loads a previously saved model, usually for testing on new, previously unseen data. In that case, a compatible test file should be specified, i.e., the same attributes in the same order. -p If a test file is specified, this parameter shows you the predictions and one attribute (0 for none) for all test instances. -o This parameter switches the human-readable output of the model description off. In case of support vector machines or NaiveBayes, this makes some sense unless you want to parse and visualize a lot of information. We now give a short list of selected classifiers in WEKA: trees.J48 A clone of the C4.5 decision tree learner bayes.NaiveBayes A Naive Bayesian learner. -K switches on kernel density estimation for numerical attributes which often improves performance. meta.ClassificationViaRegression -W functions.LinearRegression Multi-response linear regression. functions.Logistic Logistic Regression. functions.SMO Support Vector Machine (linear, polynomial and RBF kernel) with the Sequential Minimal Optimization algorithm due to [Platt, 1998]. Defaults to SVM with linear kernel, -E 5 -C 10 gives an SVM with polynomial kernel of degree 5 and lambda=10. lazy.KStar Instance-Based learner. -E sets the blend entropy automatically, which is usually preferable.
lazy.IBk Instance-Based learner with fixed neighborhood. -K sets the number of neighbors to use. IB1 is equivalent to IBk -K 1 rules.JRip A clone of the RIPPER rule learner. Based on a simple example, we will now explain the output of a typical classifier, weka.classifiers.trees.J48 . Consider the following call from the command line, or start the WEKA explorer and train J48 on weather.numeric.arff: java weka.classifiers.trees.J48 -t data/weather.numeric.arff J48 pruned tree ------------------ outlook = sunny | humidity <= 75: yes (2.0) | humidity > 75: no (3.0) outlook = overcast: yes (4.0) outlook = rainy | windy = TRUE: no (2.0) | windy = FALSE: yes (3.0) Number of Leaves : 5 Size of the tree : 8 The first part, unless you specify -o , is a human-readable form of the training set model. In this case, it is a decision tree. outlook is at the root of the tree and determines the first decision. In case it is overcast, we'll always play golf. The numbers in (parentheses) at the end of each leaf tell us the number of examples in this leaf. If one or more leaves were not pure (= all of the same class), the number of misclassified examples would also be given, after a /slash/. Time taken to build model: 0.05 seconds Time taken to test model on training data: 0 seconds As you can see, a decision tree learns quite fast and is evaluated even faster. == Error on training data == Correctly Classified Instances 14 100 % Incorrectly Classified Instances 0 0 % Kappa statistic 1 Mean absolute error 0 Root mean squared error 0 Relative absolute error 0 % Root relative squared error 0 % Total Number of Instances 14 == Detailed Accuracy By Class == TP Rate FP Rate Precision Recall F-Measure Class 1 0 1 1 1 yes 1 0 1 1 1 no == Confusion Matrix == a b <-- classified as 9 0 | a = yes 0 5 | b = no This is quite boring: our classifier is perfect, at least on the training data -- all instances were classified correctly and all errors are zero. As is usually the case, the training set accuracy is too optimistic. The detailed accuracy by class and the confusion matrix are similarly trivial. == Stratified cross-validation == Correctly Classified Instances 9 64.2857 % Incorrectly Classified Instances 5 35.7143 % Kappa statistic 0.186 Mean absolute error 0.2857 Root mean squared error 0.4818 Relative absolute error 60 % Root relative squared error 97.6586 % Total Number of Instances 14 == Detailed Accuracy By Class == TP Rate FP Rate Precision Recall F-Measure Class 0.778 0.6 0.7 0.778 0.737 yes 0.4 0.222 0.5 0.4 0.444 no == Confusion Matrix == a b <-- classified as 7 2 | a = yes 3 2 | b = no The stratified cross-validation paints a more realistic picture. The accuracy is around 64%. The kappa statistic measures the agreement of prediction with the true class -- 1.0 signifies complete agreement. The error values that are shown, e.g., the root of the mean squared error, indicate the accuracy of the probability estimates that are generated by the classification model. The confusion matrix is more commonly named contingency table . In our case we have two classes and therefore a 2x2 confusion matrix; in general, the matrix can be arbitrarily large. The number of correctly classified instances is the sum of the diagonal elements of the matrix; all others are incorrectly classified (class \"a\" gets misclassified as \"b\" exactly twice, and class \"b\" gets misclassified as \"a\" three times).
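The same kind of evaluation can also be produced from your own Java code via the weka.classifiers.Evaluation class; a minimal sketch (the class name CrossValidate is made up for illustration):

```java
import weka.classifiers.Evaluation;
import weka.classifiers.trees.J48;
import weka.core.Instances;
import java.io.BufferedReader;
import java.io.FileReader;
import java.util.Random;

public class CrossValidate {
  public static void main(String[] args) throws Exception {
    Instances data = new Instances(
        new BufferedReader(new FileReader("data/weather.numeric.arff")));
    data.setClassIndex(data.numAttributes() - 1);
    // stratified 10-fold cross-validation of J48, as in the output above
    Evaluation eval = new Evaluation(data);
    eval.crossValidateModel(new J48(), data, 10, new Random(1));
    System.out.println(eval.toSummaryString());
    System.out.println(eval.toMatrixString());  // the confusion matrix
  }
}
```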
The True Positive (TP) rate is the proportion of examples which were classified as class x , among all examples which truly have class x , i.e., how much of the class was captured correctly. It is equivalent to Recall . In the confusion matrix, this is the diagonal element divided by the sum over the relevant row, i.e., 7/(7+2)=0.778 for class yes and 2/(3+2)=0.4 for class no in our example. The False Positive (FP) rate is the proportion of examples which were classified as class x , but belong to a different class, among all examples which are not of class x . In the matrix, this is the column sum of class x minus the diagonal element, divided by the sum over the rows of all other classes; i.e., 3/5=0.6 for class yes and 2/9=0.222 for class no . The Precision is the proportion of the examples which truly have class x among all those which were classified as class x . In the matrix, this is the diagonal element divided by the sum over the relevant column, i.e., 7/(7+3)=0.7 for class yes and 2/(2+2)=0.5 for class no . The F-Measure is simply 2*Precision*Recall/(Precision+Recall), a combined measure for precision and recall. These measures are useful for comparing classifiers. However, if more detailed information about the classifier's predictions is necessary, -p # outputs just the predictions for each test instance, along with a range of one-based attribute ids (0 for none). Let's look at the following example. We shall assume soybean-train.arff and soybean-test.arff have been constructed via weka.filters.supervised.instance.StratifiedRemoveFolds as in a previous example. java weka.classifiers.bayes.NaiveBayes -K -t soybean-train.arff -T soybean-test.arff -p 0 0 diaporthe-stem-canker 0.9999672587892333 diaporthe-stem-canker 1 diaporthe-stem-canker 0.9999992614503429 diaporthe-stem-canker 2 diaporthe-stem-canker 0.999998948559035 diaporthe-stem-canker 3 diaporthe-stem-canker 0.9999998441238833 diaporthe-stem-canker 4 diaporthe-stem-canker 0.9999989997681132 diaporthe-stem-canker 5 rhizoctonia-root-rot 0.9999999395928124 rhizoctonia-root-rot 6 rhizoctonia-root-rot 0.999998912860593 rhizoctonia-root-rot 7 rhizoctonia-root-rot 0.9999994386283236 rhizoctonia-root-rot ... The values in each line are separated by a single space. The fields are the zero-based test instance id, followed by the predicted class value, the confidence for the prediction (estimated probability of predicted class), and the true class. All these are correctly classified, so let's look at a few erroneous ones. 32 phyllosticta-leaf-spot 0.7789710144361445 brown-spot ... 39 alternarialeaf-spot 0.6403333824349896 brown-spot ... 44 phyllosticta-leaf-spot 0.893568420641914 brown-spot ... 46 alternarialeaf-spot 0.5788190397739439 brown-spot ... 73 brown-spot 0.4943768155314637 alternarialeaf-spot ... In each of these cases, a misclassification occurred, mostly between classes alternarialeaf-spot and brown-spot . The confidences seem to be lower than for correct classifications, so for a real-life application it may make sense to output don't know below a certain threshold. WEKA also outputs a trailing newline. If we had chosen a range of attributes via -p , e.g., -p first-last , the mentioned attributes would have been output afterwards as comma-separated values, in parentheses. However, the zero-based instance id in the first column offers a safer way to determine the test instances.
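To make the arithmetic behind these per-class measures concrete, here is a small self-contained sketch (the class name ConfusionMatrixMetrics is made up for illustration) that recomputes them from the 2x2 confusion matrix shown above:

```java
public class ConfusionMatrixMetrics {
  public static void main(String[] args) {
    // rows = true class, columns = predicted class; from the J48 example above
    double[][] m = { { 7, 2 },   // a = yes
                     { 3, 2 } }; // b = no
    double total = 0;
    for (double[] row : m)
      for (double v : row) total += v;
    for (int c = 0; c < m.length; c++) {
      double tp = m[c][c], rowSum = 0, colSum = 0;
      for (int i = 0; i < m.length; i++) {
        rowSum += m[c][i];  // all examples that truly have class c
        colSum += m[i][c];  // all examples classified as class c
      }
      double tpRate = tp / rowSum;                      // = recall
      double fpRate = (colSum - tp) / (total - rowSum);
      double precision = tp / colSum;
      double fMeasure = 2 * precision * tpRate / (precision + tpRate);
      System.out.printf("class %d: TP rate %.3f, FP rate %.3f, precision %.3f, F %.3f%n",
          c, tpRate, fpRate, precision, fMeasure);
    }
  }
}
```

Running this prints 0.778/0.600/0.700/0.737 for class yes and 0.400/0.222/0.500/0.444 for class no, matching the Detailed Accuracy By Class output above.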
Usually, if you evaluate a classifier for a longer experiment, you will do something like this (for csh): java -Xmx1024m weka.classifiers.trees.J48 -t data.arff -k -d J48-data.model >&! J48-data.out & The -Xmx1024m parameter for maximum heap size enables the Java heap, where Java stores objects, to grow to a maximum size of 1024 Megabytes. There is no overhead involved; it just leaves more room for the heap to grow. The -k flag gives you some additional performance statistics. In case your model performs well, it makes sense to save it via -d -- you can always delete it later! The implicit cross-validation gives a more reasonable estimate of the expected accuracy on unseen data than the training set accuracy. Both standard error and standard output should be redirected, so you get both errors and the normal output of your classifier. The last & starts the task in the background. Keep an eye on your task via top , and if you notice the hard disk works hard all the time (on Linux), this probably means your task needs too much memory and will not finish in time for the exam. ;-) In that case, switch to a faster classifier or use filters , e.g., Resample to reduce the size of your dataset or StratifiedRemoveFolds to create training and test sets -- for most classifiers, training takes more time than testing. So, now you have run a lot of experiments -- which classifier is best? Try cat *.out | grep -A 3 \"Stratified\" | grep \"^Correctly\" ...this should give you all cross-validated accuracies. If the cross-validated accuracy is roughly the same as the training set accuracy, this indicates that your classifier is presumably not overfitting the training set. Assume you have found the best classifier. To apply it on a new dataset, use something like java weka.classifiers.trees.J48 -l J48-data.model -T new-data.arff You will have to use the same classifier to load the model, but you need not set any options. Just add the new test file via -T . If you want, -p first-last will output all test instances with classifications and confidence scores, followed by all attribute values, so you can look at each error separately. The following more complex csh script creates datasets for learning curves, creating a 75% training set and 25% test set from a given dataset, then successively reducing the training set by a factor of 1.2 (83%), until it is also 25% in size. All this is repeated thirty times, with different random reorderings (-S) and the results are written to different directories. The Experimenter GUI in WEKA can be used to design and run similar experiments. #!/bin/csh foreach f ( $* ) set run = 1 while ( $run <= 30 ) mkdir $run >&! /dev/null java weka.filters.supervised.instance.StratifiedRemoveFolds -N 4 -F 1 -S $run -c last -i ../ $f -o $run /t_ $f java weka.filters.supervised.instance.StratifiedRemoveFolds -N 4 -F 1 -S $run -V -c last -i ../ $f -o $run /t0 $f foreach nr ( 0 1 2 3 4 5 ) set nrp1 = $nr @ nrp1++ java weka.filters.supervised.instance.Resample -S 0 -Z 83 -c last -i $run /t $nr$f -o $run /t $nrp1$f end echo Run $run of $f done. @ run++ end end If meta classifiers are used, i.e., classifiers whose options include classifier specifications -- for example, StackingC or ClassificationViaRegression -- care must be taken not to mix the parameters. For example, java weka.classifiers.meta.ClassificationViaRegression -W weka.classifiers.functions.LinearRegression -S 1 \\ -t data/iris.arff -x 2 gives us an illegal options exception for -S 1 .
This parameter is meant for LinearRegression, not for ClassificationViaRegression, but WEKA does not know this by itself. One way to clarify this situation is to enclose the classifier specification, including all parameters, in \"double\" quotes, like this: java weka.classifiers.meta.ClassificationViaRegression -W \"weka.classifiers.functions.LinearRegression -S 1\" \\ -t data/iris.arff -x 2 However, this does not always work, depending on how the option handling was implemented in the top-level classifier. While for Stacking this approach would work quite well, for ClassificationViaRegression it does not. We get the dubious error message that the class weka.classifiers.functions.LinearRegression -S 1 cannot be found. Fortunately, there is another approach: All parameters given after -- are processed by the first sub-classifier; another -- lets us specify parameters for the second sub-classifier and so on. java weka.classifiers.meta.ClassificationViaRegression -W weka.classifiers.functions.LinearRegression \\ -t data/iris.arff -x 2 -- -S 1 In some cases, both approaches have to be mixed, for example: java weka.classifiers.meta.Stacking -B \"weka.classifiers.lazy.IBk -K 10\" \\ -M \"weka.classifiers.meta.ClassificationViaRegression -W weka.classifiers.functions.LinearRegression -- -S 1\" \\ -t data/iris.arff -x 2 Notice that while ClassificationViaRegression honors the -- parameter, Stacking itself does not.","title":"Primer"},{"location":"primer/#basic-concepts","text":"","title":"Basic concepts"},{"location":"primer/#dataset","text":"A set of data items, the dataset, is a very basic concept of machine learning. A dataset is roughly equivalent to a two-dimensional spreadsheet or database table. In WEKA, it is implemented by the Instances class. A dataset is a collection of examples, each one of class Instance . Each Instance consists of a number of attributes, any of which can be nominal (= one of a predefined list of values), numeric (= a real or integer number) or a string (= an arbitrarily long list of characters, enclosed in \"double quotes\"). WEKA also supports date attributes and relational attributes. The external representation of an Instances class is an ARFF file, which consists of a header describing the attribute types and the data as a comma-separated list. Here is a short, commented example. A complete description of the ARFF file format can be found here . % This is a toy example, the UCI weather dataset. % Any relation to real weather is purely coincidental. Comment lines at the beginning of the dataset should give an indication of its source, context and meaning. @relation golfWeatherMichigan_1988/02/10_14days Here we state the internal name of the dataset. Try to be as descriptive as possible. @attribute outlook {sunny, overcast, rainy} @attribute windy {TRUE, FALSE} Here we define two nominal attributes, outlook and windy . The former has three values: sunny , overcast and rainy ; the latter two: TRUE and FALSE . Nominal values with special characters, commas or spaces are enclosed in 'single quotes'. @attribute temperature numeric @attribute humidity numeric These lines define two numeric attributes. @attribute play {yes, no} The last attribute is the default target or class variable used for prediction. In our case it is a nominal attribute with two values, making this a binary classification problem.
@data sunny,FALSE,85,85,no sunny,TRUE,80,90,no overcast,FALSE,83,86,yes rainy,FALSE,70,96,yes rainy,FALSE,68,80,yes The rest of the dataset consists of the token @data, followed by comma-separated values for the attributes -- one line per example. In our case there are five examples. Some basic statistics and validation of given ARFF files can be obtained via the main() routine of weka.core.Instances : java weka.core.Instances data/soybean.arff weka.core offers some other useful routines, e.g., converters.C45Loader and converters.CSVLoader , which can be used to convert C45 datasets and comma/tab-separated datasets respectively, e.g.: java weka.core.converters.CSVLoader data.csv > data.arff java weka.core.converters.C45Loader c45_filestem > data.arff","title":"Dataset"},{"location":"primer/#classifier","text":"Any classification or regression algorithm in WEKA is derived from the abstract Classifier class. Surprisingly little is needed for a basic classifier: a routine which generates a classifier model from a training dataset (= buildClassifier ) and another routine which produces a classification for a given instance (= classifyInstance ), or generates a probability distribution for all classes of the instance (= distributionForInstance ). A classifier model is an arbitrarily complex mapping from predictor attributes to the class attribute. The specific form and creation of this mapping, or model, differs from classifier to classifier. For example, ZeroR's model just consists of a single value: the most common class in the case of classification problems, or the median of all numeric values in case of predicting a numeric value (= regression learning). ZeroR is a trivial classifier, but it gives a lower bound on the performance achievable on a given dataset that should be significantly improved upon by more complex classifiers. As such it is a reasonable test of how well the class can be predicted without considering the other attributes. Later , we will explain how to interpret the output from classifiers in detail -- for now just focus on the Correctly Classified Instances in the section Stratified cross-validation and notice how it improves from ZeroR to J48 when we use the soybean data: java weka.classifiers.rules.ZeroR -t soybean.arff java weka.classifiers.trees.J48 -t soybean.arff There are various approaches to determine the performance of classifiers. It can most simply be measured by counting the proportion of correctly predicted examples in a test dataset. This value is the classification accuracy , which is also 1-ErrorRate . Both terms are used in literature. The simplest case for evaluation is when we use a training set and a test set which are mutually independent. This is referred to as a hold-out estimate. To estimate variance in these performance estimates, hold-out estimates may be computed repeatedly by resampling the same dataset -- i.e., randomly shuffling it and then splitting it into training and test sets with a specific proportion of the examples, collecting all estimates on the test sets and computing average and standard deviation of accuracy. A more elaborate method is k -fold cross-validation. Here, a number of folds k is specified. The dataset is randomly shuffled and then split into k folds of equal size. In each iteration, one fold is used for testing and the other k-1 folds are used for training the classifier. The test results are collected and pooled (or averaged) over all folds. This gives the cross-validation estimate of accuracy.
The folds can be purely random or slightly modified to create the same class distributions in each fold as in the complete dataset. In the latter case the cross-validation is called stratified . Leave-one-out (loo) cross-validation signifies that k is equal to the number of examples. Out of necessity, loo CV has to be non-stratified, i.e., the class distributions in the test sets are not the same as those in the training data. Therefore loo CV can produce misleading results in rare cases. However it is still quite useful in dealing with small datasets since it utilizes the greatest amount of training data from the dataset.","title":"Classifier"},{"location":"primer/#weka-filters","text":"The weka.filters package contains Java classes that transform datasets -- by removing or adding attributes, resampling the dataset, removing examples and so on. This package offers useful support for data preprocessing, which is an important step in machine learning. All filters offer the command-line option -i for specifying the input dataset, and the option -o for specifying the output dataset. If either of these parameters is not given, standard input or output is used instead, which allows the filters to be used within pipes. Other parameters are specific to each filter and can be found out via -h , as with any other class. The weka.filters package is organized into supervised and unsupervised filtering, both of which are again subdivided into instance and attribute filtering. We will discuss each of the four subsections separately.","title":"weka filters"},{"location":"primer/#wekafilterssupervised","text":"Classes below weka.filters.supervised in WEKA's Java class hierarchy are for supervised filtering, i.e., taking advantage of the class information. For those filters, a class must be assigned by providing the index of the class attribute via -c .","title":"weka.filters.supervised"},{"location":"primer/#attribute","text":"Discretize is used to discretize numeric attributes into nominal ones, based on the class information, via Fayyad & Irani's MDL method, or optionally with Kononenko's MDL method. Some learning schemes or classifiers can only process nominal data, e.g., rules.Prism ; and in some cases discretization may also reduce learning time and help combat overfitting. java weka.filters.supervised.attribute.Discretize -i data/iris.arff -o iris-nom.arff -c last java weka.filters.supervised.attribute.Discretize -i data/cpu.arff -o cpu-classvendor-nom.arff -c first NominalToBinary encodes all nominal attributes into binary (two-valued) attributes, which can be used to transform the dataset into a purely numeric representation, e.g., for visualization via multi-dimensional scaling. java weka.filters.supervised.attribute.NominalToBinary -i data/contact-lenses.arff -o contact-lenses-bin.arff -c last Note that most classifiers in WEKA utilize transformation filters internally, e.g., Logistic and SMO, so you may not have to use these filters explicitly.","title":"attribute"},{"location":"primer/#instance","text":"Resample creates a stratified subsample of the given dataset. This means that overall class distributions are approximately retained within the sample. A bias towards uniform class distribution can be specified via -B . java weka.filters.supervised.instance.Resample -i data/soybean.arff -o soybean-5%.arff -c last -Z 5 java weka.filters.supervised.instance.Resample -i data/soybean.arff -o soybean-uniform-5%.arff -c last -Z 5 -B 1 StratifiedRemoveFolds creates stratified cross-validation folds of the given dataset.
This means that by default the class distributions are approximately retained within each fold. The following example splits soybean.arff into stratified training and test datasets, the latter consisting of 25% (=1/4) of the data. java weka.filters.supervised.instance.StratifiedRemoveFolds -i data/soybean.arff -o soybean-train.arff \\ -c last -N 4 -F 1 -V java weka.filters.supervised.instance.StratifiedRemoveFolds -i data/soybean.arff -o soybean-test.arff \\ -c last -N 4 -F 1","title":"instance"},{"location":"primer/#wekafiltersunsupervised","text":"Classes below weka.filters.unsupervised in WEKA's Java class hierarchy are for unsupervised filtering, e.g., the non-stratified version of Resample. A class should not be assigned here.","title":"weka.filters.unsupervised"},{"location":"primer/#attribute_1","text":"StringToWordVector transforms string attributes into word vectors, e.g., creating one attribute for each word that either encodes presence or word count ( -C ) within the string. -W can be used to set an approximate limit on the number of words. When a class is assigned, the limit applies to each class separately. This filter is useful for text mining. Obfuscate renames the dataset name, all attribute names and nominal attribute values. This is intended for exchanging sensitive datasets without giving away restricted information. Remove is intended for explicit deletion of attributes from a dataset, e.g. for removing attributes of the iris dataset: java weka.filters.unsupervised.attribute.Remove -R 1-2 -i data/iris.arff -o iris-simplified.arff java weka.filters.unsupervised.attribute.Remove -V -R 3-last -i data/iris.arff -o iris-simplified.arff","title":"attribute"},{"location":"primer/#instance_1","text":"Resample creates a non-stratified subsample of the given dataset. It performs random sampling without regard to the class information. Otherwise it is equivalent to its supervised variant. java weka.filters.unsupervised.instance.Resample -i data/soybean.arff -o soybean-5%.arff -Z 5 RemoveFolds creates cross-validation folds of the given dataset. The class distributions are not retained. The following example splits soybean.arff into training and test datasets, the latter consisting of 25% (=1/4) of the data. java weka.filters.unsupervised.instance.RemoveFolds -i data/soybean.arff -o soybean-train.arff -c last -N 4 -F 1 -V java weka.filters.unsupervised.instance.RemoveFolds -i data/soybean.arff -o soybean-test.arff -c last -N 4 -F 1 RemoveWithValues filters instances according to the value of an attribute. java weka.filters.unsupervised.instance.RemoveWithValues -i data/soybean.arff \\ -o soybean-without_herbicide_injury.arff -V -C last -L 19","title":"instance"},{"location":"primer/#wekaclassifiers","text":"Classifiers are at the core of WEKA. There are a lot of common options for classifiers, most of which are related to evaluation purposes. We will focus on the most important ones. All others including classifier-specific parameters can be found via -h , as usual. Parameter Description -t specifies the training file (ARFF format) -T specifies the test file (ARFF format). If this parameter is missing, a cross-validation will be performed (default: 10-fold cv) -x This parameter determines the number of folds for the cross-validation. A cv will only be performed if -T is missing. -c As we already know from the weka.filters section, this parameter sets the class variable with a one-based index. -d The model after training can be saved via this parameter.
Each classifier has a different binary format for the model, so it can only be read back by the same classifier on a compatible dataset. Only the model on the training set is saved, not the multiple models generated via cross-validation. -l Loads a previously saved model, usually for testing on new, previously unseen data. In that case, a compatible test file should be specified, i.e., the same attributes in the same order. -p If a test file is specified, this parameter shows you the predictions and one attribute (0 for none) for all test instances. -o This parameter switches the human-readable output of the model description off. In case of support vector machines or NaiveBayes, this makes some sense unless you want to parse and visualize a lot of information. We now give a short list of selected classifiers in WEKA: trees.J48 A clone of the C4.5 decision tree learner bayes.NaiveBayes A Naive Bayesian learner. -K switches on kernel density estimation for numerical attributes which often improves performance. meta.ClassificationViaRegression -W functions.LinearRegression Multi-response linear regression. functions.Logistic Logistic Regression. functions.SMO Support Vector Machine (linear, polynomial and RBF kernel) with the Sequential Minimal Optimization algorithm due to [Platt, 1998]. Defaults to SVM with linear kernel, -E 5 -C 10 gives an SVM with polynomial kernel of degree 5 and lambda=10. lazy.KStar Instance-Based learner. -E sets the blend entropy automatically, which is usually preferable. lazy.IBk Instance-Based learner with fixed neighborhood. -K sets the number of neighbors to use. IB1 is equivalent to IBk -K 1 rules.JRip A clone of the RIPPER rule learner. Based on a simple example, we will now explain the output of a typical classifier, weka.classifiers.trees.J48 . Consider the following call from the command line, or start the WEKA explorer and train J48 on weather.numeric.arff: java weka.classifiers.trees.J48 -t data/weather.numeric.arff J48 pruned tree ------------------ outlook = sunny | humidity <= 75: yes (2.0) | humidity > 75: no (3.0) outlook = overcast: yes (4.0) outlook = rainy | windy = TRUE: no (2.0) | windy = FALSE: yes (3.0) Number of Leaves : 5 Size of the tree : 8 The first part, unless you specify -o , is a human-readable form of the training set model. In this case, it is a decision tree. outlook is at the root of the tree and determines the first decision. In case it is overcast, we'll always play golf. The numbers in (parentheses) at the end of each leaf tell us the number of examples in this leaf. If one or more leaves were not pure (= all of the same class), the number of misclassified examples would also be given, after a /slash/. Time taken to build model: 0.05 seconds Time taken to test model on training data: 0 seconds As you can see, a decision tree learns quite fast and is evaluated even faster. == Error on training data == Correctly Classified Instances 14 100 % Incorrectly Classified Instances 0 0 % Kappa statistic 1 Mean absolute error 0 Root mean squared error 0 Relative absolute error 0 % Root relative squared error 0 % Total Number of Instances 14 == Detailed Accuracy By Class == TP Rate FP Rate Precision Recall F-Measure Class 1 0 1 1 1 yes 1 0 1 1 1 no == Confusion Matrix == a b <-- classified as 9 0 | a = yes 0 5 | b = no This is quite boring: our classifier is perfect, at least on the training data -- all instances were classified correctly and all errors are zero. As is usually the case, the training set accuracy is too optimistic.
The detailed accuracy by class and the confusion matrix are similarly trivial. == Stratified cross-validation == Correctly Classified Instances 9 64.2857 % Incorrectly Classified Instances 5 35.7143 % Kappa statistic 0.186 Mean absolute error 0.2857 Root mean squared error 0.4818 Relative absolute error 60 % Root relative squared error 97.6586 % Total Number of Instances 14 == Detailed Accuracy By Class == TP Rate FP Rate Precision Recall F-Measure Class 0.778 0.6 0.7 0.778 0.737 yes 0.4 0.222 0.5 0.4 0.444 no == Confusion Matrix == a b <-- classified as 7 2 | a = yes 3 2 | b = no The stratified cross-validation paints a more realistic picture. The accuracy is around 64%. The kappa statistic measures the agreement of prediction with the true class -- 1.0 signifies complete agreement. The error values that are shown, e.g., the root of the mean squared error, indicate the accuracy of the probability estimates that are generated by the classification model. The confusion matrix is more commonly named contingency table . In our case we have two classes and therefore a 2x2 confusion matrix; in general, the matrix can be arbitrarily large. The number of correctly classified instances is the sum of the diagonal elements of the matrix; all others are incorrectly classified (class \"a\" gets misclassified as \"b\" exactly twice, and class \"b\" gets misclassified as \"a\" three times). The True Positive (TP) rate is the proportion of examples which were classified as class x , among all examples which truly have class x , i.e., how much of the class was captured correctly. It is equivalent to Recall . In the confusion matrix, this is the diagonal element divided by the sum over the relevant row, i.e., 7/(7+2)=0.778 for class yes and 2/(3+2)=0.4 for class no in our example. The False Positive (FP) rate is the proportion of examples which were classified as class x , but belong to a different class, among all examples which are not of class x . In the matrix, this is the column sum of class x minus the diagonal element, divided by the sum over the rows of all other classes; i.e., 3/5=0.6 for class yes and 2/9=0.222 for class no . The Precision is the proportion of the examples which truly have class x among all those which were classified as class x . In the matrix, this is the diagonal element divided by the sum over the relevant column, i.e., 7/(7+3)=0.7 for class yes and 2/(2+2)=0.5 for class no . The F-Measure is simply 2*Precision*Recall/(Precision+Recall), a combined measure for precision and recall. These measures are useful for comparing classifiers. However, if more detailed information about the classifier's predictions is necessary, -p # outputs just the predictions for each test instance, along with a range of one-based attribute ids (0 for none). Let's look at the following example. We shall assume soybean-train.arff and soybean-test.arff have been constructed via weka.filters.supervised.instance.StratifiedRemoveFolds as in a previous example.
java weka.classifiers.bayes.NaiveBayes -K -t soybean-train.arff -T soybean-test.arff -p 0 0 diaporthe-stem-canker 0.9999672587892333 diaporthe-stem-canker 1 diaporthe-stem-canker 0.9999992614503429 diaporthe-stem-canker 2 diaporthe-stem-canker 0.999998948559035 diaporthe-stem-canker 3 diaporthe-stem-canker 0.9999998441238833 diaporthe-stem-canker 4 diaporthe-stem-canker 0.9999989997681132 diaporthe-stem-canker 5 rhizoctonia-root-rot 0.9999999395928124 rhizoctonia-root-rot 6 rhizoctonia-root-rot 0.999998912860593 rhizoctonia-root-rot 7 rhizoctonia-root-rot 0.9999994386283236 rhizoctonia-root-rot ... The values in each line are separated by a single space. The fields are the zero-based test instance id, followed by the predicted class value, the confidence for the prediction (estimated probability of predicted class), and the true class. All these are correctly classified, so let's look at a few erroneous ones. 32 phyllosticta-leaf-spot 0.7789710144361445 brown-spot ... 39 alternarialeaf-spot 0.6403333824349896 brown-spot ... 44 phyllosticta-leaf-spot 0.893568420641914 brown-spot ... 46 alternarialeaf-spot 0.5788190397739439 brown-spot ... 73 brown-spot 0.4943768155314637 alternarialeaf-spot ... In each of these cases, a misclassification occurred, mostly between classes alternarialeaf-spot and brown-spot . The confidences seem to be lower than for correct classifications, so for a real-life application it may make sense to output don't know below a certain threshold. WEKA also outputs a trailing newline. If we had chosen a range of attributes via -p , e.g., -p first-last , the mentioned attributes would have been output afterwards as comma-separated values, in parentheses. However, the zero-based instance id in the first column offers a safer way to determine the test instances. Usually, if you evaluate a classifier for a longer experiment, you will do something like this (for csh): java -Xmx1024m weka.classifiers.trees.J48 -t data.arff -k -d J48-data.model >&! J48-data.out & The -Xmx1024m parameter for maximum heap size enables the Java heap, where Java stores objects, to grow to a maximum size of 1024 Megabytes. There is no overhead involved; it just leaves more room for the heap to grow. The -k flag gives you some additional performance statistics. In case your model performs well, it makes sense to save it via -d -- you can always delete it later! The implicit cross-validation gives a more reasonable estimate of the expected accuracy on unseen data than the training set accuracy. Both standard error and standard output should be redirected, so you get both errors and the normal output of your classifier. The last & starts the task in the background. Keep an eye on your task via top , and if you notice the hard disk works hard all the time (on Linux), this probably means your task needs too much memory and will not finish in time for the exam. ;-) In that case, switch to a faster classifier or use filters , e.g., Resample to reduce the size of your dataset or StratifiedRemoveFolds to create training and test sets -- for most classifiers, training takes more time than testing. So, now you have run a lot of experiments -- which classifier is best? Try cat *.out | grep -A 3 \"Stratified\" | grep \"^Correctly\" ...this should give you all cross-validated accuracies. If the cross-validated accuracy is roughly the same as the training set accuracy, this indicates that your classifier is presumably not overfitting the training set. Assume you have found the best classifier.
To apply it on a new dataset, use something like java weka.classifiers.trees.J48 -l J48-data.model -T new-data.arff You will have to use the same classifier to load the model, but you need not set any options. Just add the new test file via -T . If you want, -p first-last will output all test instances with classifications and confidence scores, followed by all attribute values, so you can look at each error separately. The following more complex csh script creates datasets for learning curves, creating a 75% training set and 25% test set from a given dataset, then successively reducing the training set by a factor of 1.2 (83%), until it is also 25% in size. All this is repeated thirty times, with different random reorderings (-S) and the results are written to different directories. The Experimenter GUI in WEKA can be used to design and run similar experiments. #!/bin/csh foreach f ( $* ) set run = 1 while ( $run <= 30 ) mkdir $run >&! /dev/null java weka.filters.supervised.instance.StratifiedRemoveFolds -N 4 -F 1 -S $run -c last -i ../ $f -o $run /t_ $f java weka.filters.supervised.instance.StratifiedRemoveFolds -N 4 -F 1 -S $run -V -c last -i ../ $f -o $run /t0 $f foreach nr ( 0 1 2 3 4 5 ) set nrp1 = $nr @ nrp1++ java weka.filters.supervised.instance.Resample -S 0 -Z 83 -c last -i $run /t $nr$f -o $run /t $nrp1$f end echo Run $run of $f done. @ run++ end end If meta classifiers are used, i.e., classifiers whose options include classifier specifications -- for example, StackingC or ClassificationViaRegression -- care must be taken not to mix the parameters. For example, java weka.classifiers.meta.ClassificationViaRegression -W weka.classifiers.functions.LinearRegression -S 1 \\ -t data/iris.arff -x 2 gives us an illegal options exception for -S 1 . This parameter is meant for LinearRegression, not for ClassificationViaRegression, but WEKA does not know this by itself. One way to clarify this situation is to enclose the classifier specification, including all parameters, in \"double\" quotes, like this: java weka.classifiers.meta.ClassificationViaRegression -W \"weka.classifiers.functions.LinearRegression -S 1\" \\ -t data/iris.arff -x 2 However, this does not always work, depending on how the option handling was implemented in the top-level classifier. While for Stacking this approach would work quite well, for ClassificationViaRegression it does not. We get the dubious error message that the class weka.classifiers.functions.LinearRegression -S 1 cannot be found. Fortunately, there is another approach: All parameters given after -- are processed by the first sub-classifier; another -- lets us specify parameters for the second sub-classifier and so on. java weka.classifiers.meta.ClassificationViaRegression -W weka.classifiers.functions.LinearRegression \\ -t data/iris.arff -x 2 -- -S 1 In some cases, both approaches have to be mixed, for example: java weka.classifiers.meta.Stacking -B \"weka.classifiers.lazy.IBk -K 10\" \\ -M \"weka.classifiers.meta.ClassificationViaRegression -W weka.classifiers.functions.LinearRegression -- -S 1\" \\ -t data/iris.arff -x 2 Notice that while ClassificationViaRegression honors the -- parameter, Stacking itself does not.","title":"weka.classifiers"},{"location":"properties_file/","text":"General # A properties file is a simple text file with this structure: key = value Notes: Comments start with the hash sign # . Backslashes within values need to be doubled (backslashes already get interpreted when a property is read).
To make a rather long property line more readable, one can use a backslash to continue on the next line. The Filter property, e.g., looks like this: weka.filters.Filter = \\ > weka.filters.supervised.attribute, \\ > weka.filters.supervised.instance, \\ > weka.filters.unsupervised.attribute, \\ > weka.filters.unsupervised.instance Precedence # The Weka property files (extension .props ) are searched for in the following order: current directory (< Weka 3.7.2) the user's home directory (see FAQ Where is my home directory located? for more information) (>= Weka 3.7.2) $WEKA_HOME/props (the default value for WEKA_HOME is user's home directory/wekafiles). the class path (normally the weka.jar file) If WEKA encounters those files, it only supplements the properties, never overrides them. In other words, a property in the property file of the current directory has a higher precedence than the one in the user's home directory. Note: Under Cygwin , the home directory is still the Windows one, since the Java installation will still be a Windows one. How to modify a .props file? # It is quite possible that the default setup of WEKA is not to your liking and that you want to tweak it a little bit. The use of .props files instead of hard-coding makes it quite easy to modify WEKA's behavior. As an example, we are modifying the background color of the 2D plots in the Explorer, changing it to dark gray . The responsible .props file is weka/gui/visualize/Visualize.props . These are the necessary steps: close WEKA extract the .props file from the weka.jar , using an archive manager that can handle ZIP files (e.g., 7-Zip under Windows) place this .props file in your home directory (see FAQ Where is my home directory located? on how to determine your home directory), or for Weka 3.7.2 or higher place this .props file in $WEKA_HOME/props (the default value of WEKA_HOME is user's home directory/wekafiles) open this .props file with a text editor ( NB: Notepad under Windows might not handle the Unix line-endings correctly!) navigate to the property weka.gui.visualize.Plot2D.backgroundColour and change the color after the equal sign (\"=\") to darkGray (the article about weka/gui/visualize/Visualize.props lists all possible colors) save the file and restart WEKA Notes # Escaping Backslashes in values need to be escaped (i.e., doubled), otherwise they get interpreted as escape sequences. E.g., \"is\\this\" will be interpreted as \"is\" followed by a tab and \"his\", since \\t denotes a tab character. Correctly escaped, this would read as \"is\\\\this\". See also # Further information about specific props files: weka/core/Capabilities.props weka/core/logging/Logging.props weka/experiment/DatabaseUtils.props weka/gui/GenericObjectEditor.props weka/gui/GUIEditors.props weka/gui/GenericPropertiesCreator.props weka/gui/GenericPropertiesCreator.excludes weka/gui/LookAndFeel.props weka/gui/MemoryUsage.props weka/gui/SimpleCLI.props weka/gui/beans/Beans.props weka/gui/experiment/Experimenter.props weka/gui/explorer/Explorer.props weka/gui/scripting/Groovy.props weka/gui/scripting/Jython.props weka/gui/treevisualizer/TreeVisualizer.props weka/gui/visualize/Visualize.props","title":"Properties File"},{"location":"properties_file/#general","text":"A properties file is a simple text file with this structure: key = value Notes: Comments start with the hash sign # . Backslashes within values need to be doubled (backslashes already get interpreted when a property is read). To make a rather long property line more readable, one can use a backslash to continue on the next line.
The Filter property, e.g., looks like this: weka.filters.Filter = \\ > weka.filters.supervised.attribute, \\ > weka.filters.supervised.instance, \\ > weka.filters.unsupervised.attribute, \\ > weka.filters.unsupervised.instance","title":"General"},{"location":"properties_file/#precedence","text":"The Weka property files (extension .props ) are searched for in the following order: current directory (< Weka 3.7.2) the user's home directory (see FAQ Where is my home directory located? for more information) (>= Weka 3.7.2) $WEKA_HOME/props (the default value for WEKA_HOME is user's home directory/wekafiles). the class path (normally the weka.jar file) If WEKA encounters those files, it only supplements the properties, never overrides them. In other words, a property in the property file of the current directory has a higher precedence than the one in the user's home directory. Note: Under Cygwin , the home directory is still the Windows one, since the Java installation will still be a Windows one.","title":"Precedence"},{"location":"properties_file/#how-to-modify-a-props-file","text":"It is quite possible that the default setup of WEKA is not to your liking and that you want to tweak it a little bit. The use of .props files instead of hard-coding makes it quite easy to modify WEKA's behavior. As an example, we are modifying the background color of the 2D plots in the Explorer, changing it to dark gray . The responsible .props file is weka/gui/visualize/Visualize.props . These are the necessary steps: close WEKA extract the .props file from the weka.jar , using an archive manager that can handle ZIP files (e.g., 7-Zip under Windows) place this .props file in your home directory (see FAQ Where is my home directory located? on how to determine your home directory), or for Weka 3.7.2 or higher place this .props file in $WEKA_HOME/props (the default value of WEKA_HOME is user's home directory/wekafiles) open this .props file with a text editor ( NB: Notepad under Windows might not handle the Unix line-endings correctly!) navigate to the property weka.gui.visualize.Plot2D.backgroundColour and change the color after the equal sign (\"=\") to darkGray (the article about weka/gui/visualize/Visualize.props lists all possible colors) save the file and restart WEKA","title":"How to modify a .props file?"},{"location":"properties_file/#notes","text":"Escaping Backslashes in values need to be escaped (i.e., doubled), otherwise they get interpreted as escape sequences. E.g., \"is\\this\" will be interpreted as \"is\" followed by a tab and \"his\", since \\t denotes a tab character. Correctly escaped, this would read as \"is\\\\this\".","title":"Notes"},{"location":"properties_file/#see-also","text":"Further information about specific props files: weka/core/Capabilities.props weka/core/logging/Logging.props weka/experiment/DatabaseUtils.props weka/gui/GenericObjectEditor.props weka/gui/GUIEditors.props weka/gui/GenericPropertiesCreator.props weka/gui/GenericPropertiesCreator.excludes weka/gui/LookAndFeel.props weka/gui/MemoryUsage.props weka/gui/SimpleCLI.props weka/gui/beans/Beans.props weka/gui/experiment/Experimenter.props weka/gui/explorer/Explorer.props weka/gui/scripting/Groovy.props weka/gui/scripting/Jython.props weka/gui/treevisualizer/TreeVisualizer.props weka/gui/visualize/Visualize.props","title":"See also"},{"location":"props_file/","text":"see Properties file","title":"Props file"},{"location":"removing_misclassified_instances_from_dataset/","text":"Sometimes it is necessary to clean out the instances misclassified by a classifier from a dataset.
The following example loads a dataset, runs the RemoveMisclassified filter and saves the resulting dataset to another file: RemoveMisclassifiedTest Source code: import weka.classifiers.Classifier ; import weka.core.Instances ; import weka.filters.Filter ; import weka.filters.unsupervised.instance.RemoveMisclassified ; import java.io.BufferedReader ; import java.io.BufferedWriter ; import java.io.FileReader ; import java.io.FileWriter ; /** * Runs the RemoveMisclassified filter over a given ARFF file. * First parameter is the input file, the second the classifier * to use and the third one is the output file. * * Usage: RemoveMisclassifiedTest input.arff classname output.arff * * @author FracPete (fracpete at waikato dot ac dot nz) */ public class RemoveMisclassifiedTest { public static void main ( String [] args ) throws Exception { if ( args . length != 3 ) { System . out . println ( \"\\nUsage: RemoveMisclassifiedTest input.arff classname output.arff\\n\" ); System . exit ( 1 ); } // get data Instances input = new Instances ( new BufferedReader ( new FileReader ( args [ 0 ] ))); input . setClassIndex ( input . numAttributes () - 1 ); // get classifier Classifier c = Classifier . forName ( args [ 1 ] , new String [ 0 ] ); // setup and run filter RemoveMisclassified filter = new RemoveMisclassified (); filter . setClassifier ( c ); filter . setClassIndex ( - 1 ); filter . setNumFolds ( 0 ); filter . setThreshold ( 0.1 ); filter . setMaxIterations ( 0 ); filter . setInputFormat ( input ); Instances output = Filter . useFilter ( input , filter ); // output file BufferedWriter writer = new BufferedWriter ( new FileWriter ( args [ 2 ] )); writer . write ( output . toString ()); writer . newLine (); writer . flush (); writer . close (); } } See also # Use Weka in your Java code - for general use of the Weka API Save Instances to an ARFF File - for saving an Instances object to a file Downloads # RemoveMisclassifiedTest.java","title":"Removing misclassified instances from dataset"},{"location":"removing_misclassified_instances_from_dataset/#see-also","text":"Use Weka in your Java code - for general use of the Weka API Save Instances to an ARFF File - for saving an Instances object to a file","title":"See also"},{"location":"removing_misclassified_instances_from_dataset/#downloads","text":"RemoveMisclassifiedTest.java","title":"Downloads"},{"location":"requirements/","text":"The following matrix shows the minimum version of Java necessary to run a specific Weka version. The latest official releases of Weka require Java 8 or later. Note that if you are using Windows and your computer has a display with high pixel density (HiDPI), you may need to use Java 9 or later to avoid problems with inappropriate scaling of Weka's graphical user interfaces. Weka Java 1.4 Java 5 Java 6 Java 7 Java 8 or later <3.4.0 \u2611 \u2611 \u2611 \u2611 \u2611 3.4.x \u2611 \u2611 \u2611 \u2611 \u2611 3.5.x <3.5.3 \u2611 \u2611 \u2611 \u2611 3.6.x \u2611 \u2611 \u2611 \u2611 3.7.x 3.7.0 <3.7.14 \u2611 \u2611 3.8.x <3.8.2 \u2611 3.9.x <3.9.2 \u2611","title":"Requirements"},{"location":"roc_curves/","text":"General # Weka just varies the threshold on the class probability estimates in each case. What does that mean? In the case of a classifier that does not return proper class probabilities (like SMO without the -M option, or IB1), you will end up with only two points in the curve, since the estimated class \"probabilities\" only take the values 0 and 1, leaving just a single threshold at which the prediction changes.
Using a classifier that returns proper class probability distributions, like BayesNet, J48, or SMO with the -M option (which fits logistic models to the SVM output), you will get nice curves. The class used for calculating the ROC curve and also the AUC (the area under the curve) is weka.classifiers.evaluation.ThresholdCurve . Commandline # You can output the data for the ROC curves with the following options: -threshold-file The file to save the threshold data to. The format is determined by the extension, e.g., '.arff' for ARFF format or '.csv' for CSV. -threshold-label
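To make this concrete, here is a minimal sketch of computing the ROC curve data and the AUC programmatically via ThresholdCurve. It assumes a recent Weka release (3.8.x or later, where Evaluation.predictions() returns an ArrayList), uses a placeholder dataset file name, and picks class index 0 as the "positive" label:

```java
import java.util.Random;

import weka.classifiers.Evaluation;
import weka.classifiers.evaluation.ThresholdCurve;
import weka.classifiers.trees.J48;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class ROCExample {
  public static void main(String[] args) throws Exception {
    // load the dataset; the last attribute is assumed to be the class
    Instances data = DataSource.read("dataset.arff");  // placeholder file name
    data.setClassIndex(data.numAttributes() - 1);

    // cross-validate a classifier that produces proper class probabilities
    Evaluation eval = new Evaluation(data);
    eval.crossValidateModel(new J48(), data, 10, new Random(1));

    // generate the threshold curve data for the first class label
    ThresholdCurve tc = new ThresholdCurve();
    Instances curve = tc.getCurve(eval.predictions(), 0);

    // compute the area under the ROC curve from that data
    System.out.println("AUC: " + ThresholdCurve.getROCArea(curve));
  }
}
```

The resulting curve dataset contains, among others, "False Positive Rate" and "True Positive Rate" attributes, which can be plotted to visualize the ROC curve.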