Stanford.NLP.Fsharp


StackOverflow questions understanding

Let's start with a sample NLP task: we want to show related questions before a user asks a new one (as StackOverflow does).

There are many possible solutions for this task. Let's look at one that first extracts the key phrases that identify the question and then runs a search using them.

Approach

First of all, let's choose some real questions from StackOverflow to analyze:

Now we can use the Stanford Parser GUI to visualize the structure of these questions:

Notice that all the phrases we have selected are parts of noun phrases (NP). As a first solution, we can analyze the tags in the tree and select the NP nodes that contain word-level tags such as NN, NNS, NNP, or NNPS.

  1: 
  2: 
  3: 
  4: 
  5: 
  6: 
  7: 
  8: 
  9: 
 10: 
 11: 
 12: 
 13: 
 14: 
 15: 
 16: 
 17: 
 18: 
 19: 
 20: 
 21: 
 22: 
 23: 
 24: 
 25: 
 26: 
 27: 
 28: 
 29: 
 30: 
 31: 
 32: 
 33: 
 34: 
 35: 
 36: 
 37: 
 38: 
 39: 
 40: 
 41: 
 42: 
 43: 
 44: 
 45: 
 46: 
 47: 
 48: 
 49: 
 50: 
 51: 
 52: 
 53: 
 54: 
 55: 
 56: 
 57: 
 58: 
 59: 
 60: 
 61: 
 62: 
 63: 
 64: 
 65: 
 66: 
 67: 
 68: 
 69: 
 70: 
 71: 
 72: 
 73: 
 74: 
 75: 
 76: 
 77: 
 78: 
 79: 
 80: 
 81: 
 82: 
 83: 
 84: 
 85: 
 86: 
 87: 
 88: 
 89: 
 90: 
 91: 
 92: 
 93: 
 94: 
 95: 
 96: 
 97: 
 98: 
 99: 
100: 
101: 
#r "../../bin/Stanford.NLP.Parser.Fsharp.dll"
#r "../../packages/Stanford.NLP.Parser.3.8.0.0/lib/stanford-parser.dll"
#r "../../packages/Stanford.NLP.Parser.3.8.0.0/lib/ejml-0.23.dll"
#r "../../packages/IKVM.8.1.5717.0/lib/IKVM.Runtime.dll"
#r "../../packages/IKVM.8.1.5717.0/lib/IKVM.OpenJDK.Core.dll"
#r "../../packages/IKVM.8.1.5717.0/lib/IKVM.OpenJDK.Text.dll"


open edu.stanford.nlp.parser.lexparser
open edu.stanford.nlp.trees
open java.util
open System
open Stanford.NLP.FSharp.Parser

// Serialized English PCFG grammar. The directory must match the referenced
// Stanford.NLP.Parser 3.8.0.0 package, i.e. the 2017-06-09 parser distribution
// (the same path the second `model` definition in this script uses).
let model = @"T:\Stanford.NLP.NET\data\paket-files\nlp.stanford.edu\stanford-parser-full-2017-06-09\models\edu\stanford\nlp\models\lexparser\englishPCFG.ser.gz"
// Extra flags handed to the parser together with the model.
let options = [|"-maxLength"; "500";"-retainTmpSubcategories"; "-MAX_ITEMS";
                "500000";"-outputFormat"; "penn,typedDependenciesCollapsed"|]
// Load the lexicalized parser from the model file using the flags above.
let parser = LexicalizedParser.loadModel(model, options)
// Penn Treebank language pack and the grammatical-structure factory it provides.
let languagePack = PennTreebankLanguagePack()
let factory = languagePack.grammaticalStructureFactory()


//open edu.stanford.nlp.parser.lexparser
//open edu.stanford.nlp.trees
//open java.util
//open System
//open Stanford.NLP.FSharp.Parser
//
//#r @"IKVM.Runtime.dll"
//#r @"IKVM.OpenJDK.Core.dll"
//#r @"ejml-0.23.dll"
//#r @"stanford-parser.dll"
//#r @"Stanford.NLP.Parser.Fsharp.dll"

open edu.stanford.nlp.parser.lexparser
open edu.stanford.nlp.trees
open java.util

open System
open Stanford.NLP.FSharp.Parser

//let model = @"d:\englishPCFG.ser.gz"
// Location of the serialized English PCFG grammar on the local machine.
let model = @"T:\Stanford.NLP.NET\data\paket-files\nlp.stanford.edu\stanford-parser-full-2017-06-09\models\edu\stanford\nlp\models\lexparser\englishPCFG.ser.gz"

// Extra flags passed to the parser when the model is loaded
// (each flag is followed by its value, where it takes one).
let options =
    [| "-maxLength"
       "500"
       "-retainTmpSubcategories"
       "-MAX_ITEMS"
       "500000"
       "-outputFormat"
       "penn,typedDependenciesCollapsed" |]

// Parser instance shared by the functions below.
let parser = LexicalizedParser.loadModel(model, options)

// Penn Treebank language pack: source of the tokenizer factory used by
// `getTree` and of the grammatical-structure factory.
let tlp = new PennTreebankLanguagePack()
let gsf = tlp.grammaticalStructureFactory()

/// Tokenizes `question` with the tokenizer supplied by `tlp` and returns the
/// tree that `parser` produces for the resulting token list.
let getTree question =
    let reader = new java.io.StringReader(question)
    let tokens = tlp.getTokenizerFactory().getTokenizer(reader).tokenize()
    parser.apply(tokens)

/// Collects every NP node of `tree` that has at least one direct child tagged
/// NN, NNS, NNP or NNPS.
///
/// Children are visited before their parent and each match is prepended to
/// the accumulator, so the returned list is in reverse visiting order
/// (callers that want visiting order apply `List.rev`).
let getKeyPhrases (tree:Tree) =
    // Direct children of a node as a typed F# sequence.
    let childrenOf (t:Tree) =
        t.getChildrenAsList() |> Iterable.castToSeq<Tree>
    // True for nodes carrying one of the noun word-level tags.
    let isNoun (t:Tree) =
        match t with
        | Label NN | Label NNS | Label NNP | Label NNPS -> true
        | _ -> false
    // True for NP nodes with at least one noun-tagged direct child.
    let isNounPhrase (t:Tree) =
        match t with
        | Label NP -> childrenOf t |> Seq.exists isNoun
        | _ -> false
    // Walk the subtree first, then decide whether the node itself qualifies.
    let rec collect found (t:Tree) =
        let found =
            if t.isLeaf() then found
            else childrenOf t |> Seq.fold collect found
        if isNounPhrase t then t :: found else found
    collect [] tree

// Sample StackOverflow question titles to analyze.
let questions =
    [| "How to make an F# project work with the object browser"
       "How can I build WebSharper on Mono 3.0 on Mac?"
       "Adding extra methods as type extensions in F#"
       "How to get MonoDevelop to compile F# projects?" |]

// For every question, print it followed by the words of each key phrase
// (one array of leaf labels per selected NP node).
for question in questions do
    printfn "Question : %s" question
    // `getKeyPhrases` yields matches in reverse visiting order; restore it.
    let phrases = question |> getTree |> getKeyPhrases |> List.rev
    for phrase in phrases do
        let words =
            phrase.getLeaves()
            |> Iterable.castToArray<Tree>
            |> Array.map (fun leaf -> leaf.label().value())
        printfn "\t%A" words

If you run this script, you will see the following:

 1: 
 2: 
 3: 
 4: 
 5: 
 6: 
 7: 
 8: 
 9: 
10: 
11: 
12: 
13: 
14: 
 Question : How to make an F# project work with the object browser
 [|"an"; "F"; "#"; "project"; "work"|]
 [|"the"; "object"; "browser"|]
 Question : How can I build WebSharper on Mono 3.0 on Mac?
 [|"WebSharper"|]
 [|"Mono"; "3.0"|]
 [|"Mac"|]
 Question : Adding extra methods as type extensions in F#
 [|"extra"; "methods"|]
 [|"type"; "extensions"|]
 [|"F"; "#"|]
 Question : How to get MonoDevelop to compile F# projects?
 [|"MonoDevelop"|]
 [|"F"; "#"; "projects"|]
namespace edu
namespace edu.stanford
namespace edu.stanford.nlp
namespace edu.stanford.nlp.parser
namespace edu.stanford.nlp.parser.lexparser
namespace edu.stanford.nlp.trees
namespace java
namespace java.util
namespace System
namespace Stanford
namespace Stanford.NLP
namespace Stanford.NLP.FSharp
namespace Stanford.NLP.FSharp.Parser
val model : string

Full name: So_questions.model
val options : string []

Full name: So_questions.options
val parser : LexicalizedParser

Full name: So_questions.parser
Multiple items
type LexicalizedParser =
  inherit ParserGrammar
  new : lex:Lexicon * bg:BinaryGrammar * ug:UnaryGrammar * dg:DependencyGrammar * stateIndex:Index * wordIndex:Index * tagIndex:Index * op:Options -> LexicalizedParser
  val lex : Lexicon
  val bg : BinaryGrammar
  val ug : UnaryGrammar
  val dg : DependencyGrammar
  val stateIndex : Index
  val wordIndex : Index
  val tagIndex : Index
  val reranker : Reranker
  member defaultCoreNLPFlags : unit -> string[]
  ...

Full name: edu.stanford.nlp.parser.lexparser.LexicalizedParser

--------------------
LexicalizedParser(lex: Lexicon, bg: BinaryGrammar, ug: UnaryGrammar, dg: DependencyGrammar, stateIndex: edu.stanford.nlp.util.Index, wordIndex: edu.stanford.nlp.util.Index, tagIndex: edu.stanford.nlp.util.Index, op: Options) : unit
LexicalizedParser.loadModel() : LexicalizedParser
LexicalizedParser.loadModel(ois: java.io.ObjectInputStream) : LexicalizedParser
LexicalizedParser.loadModel(parserFileOrUrl: string, extraFlags: List) : LexicalizedParser
LexicalizedParser.loadModel(op: Options, params extraFlags: string []) : LexicalizedParser
LexicalizedParser.loadModel(parserFileOrUrl: string, params extraFlags: string []) : LexicalizedParser
LexicalizedParser.loadModel(parserFileOrUrl: string, op: Options, params extraFlags: string []) : LexicalizedParser
val languagePack : PennTreebankLanguagePack

Full name: So_questions.languagePack
Multiple items
type PennTreebankLanguagePack =
  inherit AbstractTreebankLanguagePack
  new : unit -> PennTreebankLanguagePack
  member evalBIgnoredPunctuationTags : unit -> string[]
  member getTokenizerFactory : unit -> TokenizerFactory
  member grammaticalStructureFactory : unit -> GrammaticalStructureFactory + 2 overloads
  member headFinder : unit -> HeadFinder
  member labelAnnotationIntroducingCharacters : unit -> char[]
  member punctuationTags : unit -> string[]
  member punctuationWords : unit -> string[]
  member sentenceFinalPunctuationTags : unit -> string[]
  member sentenceFinalPunctuationWords : unit -> string[]
  ...

Full name: edu.stanford.nlp.trees.PennTreebankLanguagePack

--------------------
PennTreebankLanguagePack() : unit
val factory : GrammaticalStructureFactory

Full name: So_questions.factory
PennTreebankLanguagePack.grammaticalStructureFactory() : GrammaticalStructureFactory
PennTreebankLanguagePack.grammaticalStructureFactory(puncFilter: function.Predicate) : GrammaticalStructureFactory
PennTreebankLanguagePack.grammaticalStructureFactory(puncFilter: function.Predicate, hf: HeadFinder) : GrammaticalStructureFactory
val tlp : PennTreebankLanguagePack

Full name: So_questions.tlp
val gsf : GrammaticalStructureFactory

Full name: So_questions.gsf
val getTree : question:string -> Tree

Full name: So_questions.getTree
val question : string
val tokenizer : edu.stanford.nlp.process.Tokenizer
PennTreebankLanguagePack.getTokenizerFactory() : edu.stanford.nlp.process.TokenizerFactory
namespace java.io
Multiple items
type StringReader =
  inherit Reader
  new : s:string -> StringReader
  member close : unit -> unit
  member mark : readAheadLimit:int -> unit
  member markSupported : unit -> bool
  member read : unit -> int + 1 overload
  member ready : unit -> bool
  member reset : unit -> unit
  member skip : ns:int64 -> int64

Full name: java.io.StringReader

--------------------
java.io.StringReader(s: string) : unit
val sentence : List
edu.stanford.nlp.process.Tokenizer.tokenize() : List
edu.stanford.nlp.parser.common.ParserGrammar.apply(obj: obj) : obj
edu.stanford.nlp.parser.common.ParserGrammar.apply(words: List) : Tree
val getKeyPhrases : tree:Tree -> Tree list

Full name: So_questions.getKeyPhrases
val tree : Tree
Multiple items
type Tree =
  inherit AbstractCollection
  new : unit -> Tree
  member addChild : t:Tree -> unit + 1 overload
  member ancestor : height:int * root:Tree -> Tree
  member cCommands : t1:Tree * t2:Tree -> bool
  member children : unit -> Tree[]
  member constituents : unit -> Set + 4 overloads
  member deepCopy : unit -> Tree + 2 overloads
  member dependencies : unit -> Set + 2 overloads
  member depth : unit -> int + 1 overload
  member dominates : t:Tree -> bool
  ...

Full name: edu.stanford.nlp.trees.Tree

--------------------
Tree() : unit
val isNNx : (Tree -> bool)
active recognizer Label: PennTreebankIITags -> Tree -> unit option

Full name: Stanford.NLP.FSharp.Parser.PennTreebankIIPatterns.( |Label|_| )
union case PennTreebankIITags.NN: PennTreebankIITags
union case PennTreebankIITags.NNS: PennTreebankIITags
union case PennTreebankIITags.NNP: PennTreebankIITags
union case PennTreebankIITags.NNPS: PennTreebankIITags
val isNPwithNNx : (Tree -> bool)
union case PennTreebankIITags.NP: PennTreebankIITags
val node : Tree
Tree.getChildrenAsList() : List
module Iterable

from java.util
val castToSeq : iter:java.lang.Iterable -> seq<'T>

Full name: java.util.Iterable.castToSeq
module Seq

from Microsoft.FSharp.Collections
val exists : predicate:('T -> bool) -> source:seq<'T> -> bool

Full name: Microsoft.FSharp.Collections.Seq.exists
val foldTree : (Tree list -> Tree -> Tree list)
val acc : Tree list
Tree.isLeaf() : bool
val fold : folder:('State -> 'T -> 'State) -> state:'State -> source:seq<'T> -> 'State

Full name: Microsoft.FSharp.Collections.Seq.fold
val state : Tree list
val x : Tree
val questions : string []

Full name: So_questions.questions
val iter : action:('T -> unit) -> source:seq<'T> -> unit

Full name: Microsoft.FSharp.Collections.Seq.iter
val printfn : format:Printf.TextWriterFormat<'T> -> 'T

Full name: Microsoft.FSharp.Core.ExtraTopLevelOperators.printfn
Multiple items
type List =
  member add : e:obj -> bool + 1 overload
  member addAll : c:Collection -> bool + 1 overload
  member clear : unit -> unit
  member contains : o:obj -> bool
  member containsAll : c:Collection -> bool
  member equals : o:obj -> bool
  member get : index:int -> obj
  member hashCode : unit -> int
  member indexOf : o:obj -> int
  member isEmpty : unit -> bool
  ...
  nested type __DefaultMethods

Full name: java.util.List

--------------------
type List<'T> =
  | ( [] )
  | ( :: ) of Head: 'T * Tail: 'T list
  interface IEnumerable
  interface IEnumerable<'T>
  member Head : 'T
  member IsEmpty : bool
  member Item : index:int -> 'T with get
  member Length : int
  member Tail : 'T list
  static member Cons : head:'T * tail:'T list -> 'T list
  static member Empty : 'T list

Full name: Microsoft.FSharp.Collections.List<_>
val rev : list:'T list -> 'T list

Full name: Microsoft.FSharp.Collections.List.rev
val iter : action:('T -> unit) -> list:'T list -> unit

Full name: Microsoft.FSharp.Collections.List.iter
val p : Tree
Tree.getLeaves() : List
Tree.getLeaves(list: List) : List
val castToArray : iter:java.lang.Iterable -> 'T []

Full name: java.util.Iterable.castToArray
type Array =
  member Clone : unit -> obj
  member CopyTo : array:Array * index:int -> unit + 1 overload
  member GetEnumerator : unit -> IEnumerator
  member GetLength : dimension:int -> int
  member GetLongLength : dimension:int -> int64
  member GetLowerBound : dimension:int -> int
  member GetUpperBound : dimension:int -> int
  member GetValue : params indices:int[] -> obj + 7 overloads
  member Initialize : unit -> unit
  member IsFixedSize : bool
  ...

Full name: System.Array
val map : mapping:('T -> 'U) -> array:'T [] -> 'U []

Full name: Microsoft.FSharp.Collections.Array.map
Tree.label() : edu.stanford.nlp.ling.Label
Fork me on GitHub