StackOverflow questions understanding
Let's start with a sample NLP task: we want to show related questions before a user asks a new one (the way StackOverflow does).
There are many possible solutions for this task. Let's look at one that, as a first step, tries to extract the key phrases that identify the question and then runs a search using them.
Approach
First of all, let's choose some real questions from StackOverflow to analyze them:
Now we can use the Stanford Parser GUI to visualize the structure of these questions:
We can notice that all the phrases we have selected are parts of noun phrases (NP). As a first solution, we can try to analyze the tags in the tree and select each NP that contains word-level tags such as NN, NNS, NNP, or NNPS.
1: 2: 3: 4: 5: 6: 7: 8: 9: 10: 11: 12: 13: 14: 15: 16: 17: 18: 19: 20: 21: 22: 23: 24: 25: 26: 27: 28: 29: 30: 31: 32: 33: 34: 35: 36: 37: 38: 39: 40: 41: 42: 43: 44: 45: 46: 47: 48: 49: 50: 51: 52: 53: 54: 55: 56: 57: 58: 59: 60: 61: 62: 63: 64: 65: 66: 67: 68: 69: 70: 71: 72: 73: 74: 75: 76: 77: 78: 79: 80: 81: 82: 83: 84: 85: 86: 87: 88: 89: 90: 91: 92: 93: 94: 95: 96: 97: 98: 99: 100: 101: |
// Extracts candidate key phrases (noun phrases) from StackOverflow question
// titles using the Stanford Parser.
#r "../../bin/Stanford.NLP.Parser.Fsharp.dll"
#r "../../packages/Stanford.NLP.Parser.3.8.0.0/lib/stanford-parser.dll"
#r "../../packages/Stanford.NLP.Parser.3.8.0.0/lib/ejml-0.23.dll"
#r "../../packages/IKVM.8.1.5717.0/lib/IKVM.Runtime.dll"
#r "../../packages/IKVM.8.1.5717.0/lib/IKVM.OpenJDK.Core.dll"
#r "../../packages/IKVM.8.1.5717.0/lib/IKVM.OpenJDK.Text.dll"

open edu.stanford.nlp.parser.lexparser
open edu.stanford.nlp.trees
open java.util
open System
open Stanford.NLP.FSharp.Parser

// Location of the English PCFG model on disk; adjust to your local copy.
// The model and parser are defined exactly once: the script previously bound
// `model`, `options` and `parser` twice, the first time against a different
// (2016-10-31) model directory than the 3.8.0.0 package referenced above.
let model = @"T:\Stanford.NLP.NET\data\paket-files\nlp.stanford.edu\stanford-parser-full-2017-06-09\models\edu\stanford\nlp\models\lexparser\englishPCFG.ser.gz"

// Extra flags handed to the parser when the model is loaded.
let options =
    [|"-maxLength"; "500"
      "-retainTmpSubcategories"
      "-MAX_ITEMS"; "500000"
      "-outputFormat"; "penn,typedDependenciesCollapsed"|]

// Loading the model is the expensive step, so do it once and reuse `parser`.
let parser = LexicalizedParser.loadModel(model, options)
let tlp = PennTreebankLanguagePack()
// Not referenced by the code below in this script.
let gsf = tlp.grammaticalStructureFactory()

/// Tokenizes `question` with the tokenizer supplied by `tlp` and returns the
/// tree produced by applying `parser` to the resulting tokens.
let getTree question =
    let tokenizer = tlp.getTokenizerFactory().getTokenizer(new java.io.StringReader(question))
    let sentence = tokenizer.tokenize()
    parser.apply(sentence)

/// Collects every NP node of `tree` that has at least one direct child
/// labelled NN, NNS, NNP or NNPS.
///
/// Children are visited before their parent and each match is prepended to
/// the accumulator, so the returned list is in reverse visiting order;
/// callers that want visiting order apply `List.rev`.
let getKeyPhrases (tree:Tree) =
    // Direct children of a node as an F# sequence of trees.
    let children (node:Tree) =
        node.getChildrenAsList() |> Iterable.castToSeq<Tree>

    // True for the word-level noun tags.
    let isNNx = function
        | Label NN | Label NNS | Label NNP | Label NNPS -> true
        | _ -> false

    // True for an NP node with at least one noun-tagged direct child.
    let isNPwithNNx = function
        | Label NP as node -> children node |> Seq.exists isNNx
        | _ -> false

    let rec foldTree acc (node:Tree) =
        // Descend first so nested phrases are collected before their parent.
        let acc =
            if node.isLeaf() then acc
            else children node |> Seq.fold foldTree acc
        if isNPwithNNx node then node :: acc else acc

    foldTree [] tree

let questions =
    [|"How to make an F# project work with the object browser"
      "How can I build WebSharper on Mono 3.0 on Mac?"
      "Adding extra methods as type extensions in F#"
      "How to get MonoDevelop to compile F# projects?"|]

// Print each question followed by the words of every key phrase found in it.
questions
|> Seq.iter (fun question ->
    printfn "Question : %s" question
    question
    |> getTree
    |> getKeyPhrases
    |> List.rev
    |> List.iter (fun p ->
        p.getLeaves()
        |> Iterable.castToArray<Tree>
        |> Array.map (fun x -> x.label().value())
        |> printfn "\t%A"))
If you run this script, you will see the following:
1: 2: 3: 4: 5: 6: 7: 8: 9: 10: 11: 12: 13: 14: |
Question : How to make an F# project work with the object browser
	[|"an"; "F"; "#"; "project"; "work"|]
	[|"the"; "object"; "browser"|]
Question : How can I build WebSharper on Mono 3.0 on Mac?
	[|"WebSharper"|]
	[|"Mono"; "3.0"|]
	[|"Mac"|]
Question : Adding extra methods as type extensions in F#
	[|"extra"; "methods"|]
	[|"type"; "extensions"|]
	[|"F"; "#"|]
Question : How to get MonoDevelop to compile F# projects?
	[|"MonoDevelop"|]
	[|"F"; "#"; "projects"|]
namespace edu
namespace edu.stanford
namespace edu.stanford.nlp
namespace edu.stanford.nlp.parser
namespace edu.stanford.nlp.parser.lexparser
namespace edu.stanford.nlp.trees
namespace java
namespace java.util
namespace System
namespace Stanford
namespace Stanford.NLP
namespace Stanford.NLP.FSharp
namespace Stanford.NLP.FSharp.Parser
val model : string
Full name: So_questions.model
Full name: So_questions.model
val options : string []
Full name: So_questions.options
Full name: So_questions.options
val parser : LexicalizedParser
Full name: So_questions.parser
Full name: So_questions.parser
Multiple items
type LexicalizedParser =
inherit ParserGrammar
new : lex:Lexicon * bg:BinaryGrammar * ug:UnaryGrammar * dg:DependencyGrammar * stateIndex:Index * wordIndex:Index * tagIndex:Index * op:Options -> LexicalizedParser
val lex : Lexicon
val bg : BinaryGrammar
val ug : UnaryGrammar
val dg : DependencyGrammar
val stateIndex : Index
val wordIndex : Index
val tagIndex : Index
val reranker : Reranker
member defaultCoreNLPFlags : unit -> string[]
...
Full name: edu.stanford.nlp.parser.lexparser.LexicalizedParser
--------------------
LexicalizedParser(lex: Lexicon, bg: BinaryGrammar, ug: UnaryGrammar, dg: DependencyGrammar, stateIndex: edu.stanford.nlp.util.Index, wordIndex: edu.stanford.nlp.util.Index, tagIndex: edu.stanford.nlp.util.Index, op: Options) : unit
type LexicalizedParser =
inherit ParserGrammar
new : lex:Lexicon * bg:BinaryGrammar * ug:UnaryGrammar * dg:DependencyGrammar * stateIndex:Index * wordIndex:Index * tagIndex:Index * op:Options -> LexicalizedParser
val lex : Lexicon
val bg : BinaryGrammar
val ug : UnaryGrammar
val dg : DependencyGrammar
val stateIndex : Index
val wordIndex : Index
val tagIndex : Index
val reranker : Reranker
member defaultCoreNLPFlags : unit -> string[]
...
Full name: edu.stanford.nlp.parser.lexparser.LexicalizedParser
--------------------
LexicalizedParser(lex: Lexicon, bg: BinaryGrammar, ug: UnaryGrammar, dg: DependencyGrammar, stateIndex: edu.stanford.nlp.util.Index, wordIndex: edu.stanford.nlp.util.Index, tagIndex: edu.stanford.nlp.util.Index, op: Options) : unit
LexicalizedParser.loadModel() : LexicalizedParser
LexicalizedParser.loadModel(ois: java.io.ObjectInputStream) : LexicalizedParser
LexicalizedParser.loadModel(parserFileOrUrl: string, extraFlags: List) : LexicalizedParser
LexicalizedParser.loadModel(op: Options, params extraFlags: string []) : LexicalizedParser
LexicalizedParser.loadModel(parserFileOrUrl: string, params extraFlags: string []) : LexicalizedParser
LexicalizedParser.loadModel(parserFileOrUrl: string, op: Options, params extraFlags: string []) : LexicalizedParser
LexicalizedParser.loadModel(ois: java.io.ObjectInputStream) : LexicalizedParser
LexicalizedParser.loadModel(parserFileOrUrl: string, extraFlags: List) : LexicalizedParser
LexicalizedParser.loadModel(op: Options, params extraFlags: string []) : LexicalizedParser
LexicalizedParser.loadModel(parserFileOrUrl: string, params extraFlags: string []) : LexicalizedParser
LexicalizedParser.loadModel(parserFileOrUrl: string, op: Options, params extraFlags: string []) : LexicalizedParser
val languagePack : PennTreebankLanguagePack
Full name: So_questions.languagePack
Full name: So_questions.languagePack
Multiple items
type PennTreebankLanguagePack =
inherit AbstractTreebankLanguagePack
new : unit -> PennTreebankLanguagePack
member evalBIgnoredPunctuationTags : unit -> string[]
member getTokenizerFactory : unit -> TokenizerFactory
member grammaticalStructureFactory : unit -> GrammaticalStructureFactory + 2 overloads
member headFinder : unit -> HeadFinder
member labelAnnotationIntroducingCharacters : unit -> char[]
member punctuationTags : unit -> string[]
member punctuationWords : unit -> string[]
member sentenceFinalPunctuationTags : unit -> string[]
member sentenceFinalPunctuationWords : unit -> string[]
...
Full name: edu.stanford.nlp.trees.PennTreebankLanguagePack
--------------------
PennTreebankLanguagePack() : unit
type PennTreebankLanguagePack =
inherit AbstractTreebankLanguagePack
new : unit -> PennTreebankLanguagePack
member evalBIgnoredPunctuationTags : unit -> string[]
member getTokenizerFactory : unit -> TokenizerFactory
member grammaticalStructureFactory : unit -> GrammaticalStructureFactory + 2 overloads
member headFinder : unit -> HeadFinder
member labelAnnotationIntroducingCharacters : unit -> char[]
member punctuationTags : unit -> string[]
member punctuationWords : unit -> string[]
member sentenceFinalPunctuationTags : unit -> string[]
member sentenceFinalPunctuationWords : unit -> string[]
...
Full name: edu.stanford.nlp.trees.PennTreebankLanguagePack
--------------------
PennTreebankLanguagePack() : unit
val factory : GrammaticalStructureFactory
Full name: So_questions.factory
Full name: So_questions.factory
PennTreebankLanguagePack.grammaticalStructureFactory() : GrammaticalStructureFactory
PennTreebankLanguagePack.grammaticalStructureFactory(puncFilter: function.Predicate) : GrammaticalStructureFactory
PennTreebankLanguagePack.grammaticalStructureFactory(puncFilter: function.Predicate, hf: HeadFinder) : GrammaticalStructureFactory
PennTreebankLanguagePack.grammaticalStructureFactory(puncFilter: function.Predicate) : GrammaticalStructureFactory
PennTreebankLanguagePack.grammaticalStructureFactory(puncFilter: function.Predicate, hf: HeadFinder) : GrammaticalStructureFactory
val tlp : PennTreebankLanguagePack
Full name: So_questions.tlp
Full name: So_questions.tlp
val gsf : GrammaticalStructureFactory
Full name: So_questions.gsf
Full name: So_questions.gsf
val getTree : question:string -> Tree
Full name: So_questions.getTree
Full name: So_questions.getTree
val question : string
val tokenizer : edu.stanford.nlp.process.Tokenizer
PennTreebankLanguagePack.getTokenizerFactory() : edu.stanford.nlp.process.TokenizerFactory
namespace java.io
Multiple items
type StringReader =
inherit Reader
new : s:string -> StringReader
member close : unit -> unit
member mark : readAheadLimit:int -> unit
member markSupported : unit -> bool
member read : unit -> int + 1 overload
member ready : unit -> bool
member reset : unit -> unit
member skip : ns:int64 -> int64
Full name: java.io.StringReader
--------------------
java.io.StringReader(s: string) : unit
type StringReader =
inherit Reader
new : s:string -> StringReader
member close : unit -> unit
member mark : readAheadLimit:int -> unit
member markSupported : unit -> bool
member read : unit -> int + 1 overload
member ready : unit -> bool
member reset : unit -> unit
member skip : ns:int64 -> int64
Full name: java.io.StringReader
--------------------
java.io.StringReader(s: string) : unit
val sentence : List
edu.stanford.nlp.process.Tokenizer.tokenize() : List
edu.stanford.nlp.parser.common.ParserGrammar.apply(obj: obj) : obj
edu.stanford.nlp.parser.common.ParserGrammar.apply(words: List) : Tree
edu.stanford.nlp.parser.common.ParserGrammar.apply(words: List) : Tree
val getKeyPhrases : tree:Tree -> Tree list
Full name: So_questions.getKeyPhrases
Full name: So_questions.getKeyPhrases
val tree : Tree
Multiple items
type Tree =
inherit AbstractCollection
new : unit -> Tree
member addChild : t:Tree -> unit + 1 overload
member ancestor : height:int * root:Tree -> Tree
member cCommands : t1:Tree * t2:Tree -> bool
member children : unit -> Tree[]
member constituents : unit -> Set + 4 overloads
member deepCopy : unit -> Tree + 2 overloads
member dependencies : unit -> Set + 2 overloads
member depth : unit -> int + 1 overload
member dominates : t:Tree -> bool
...
Full name: edu.stanford.nlp.trees.Tree
--------------------
Tree() : unit
type Tree =
inherit AbstractCollection
new : unit -> Tree
member addChild : t:Tree -> unit + 1 overload
member ancestor : height:int * root:Tree -> Tree
member cCommands : t1:Tree * t2:Tree -> bool
member children : unit -> Tree[]
member constituents : unit -> Set + 4 overloads
member deepCopy : unit -> Tree + 2 overloads
member dependencies : unit -> Set + 2 overloads
member depth : unit -> int + 1 overload
member dominates : t:Tree -> bool
...
Full name: edu.stanford.nlp.trees.Tree
--------------------
Tree() : unit
val isNNx : (Tree -> bool)
active recognizer Label: PennTreebankIITags -> Tree -> unit option
Full name: Stanford.NLP.FSharp.Parser.PennTreebankIIPatterns.( |Label|_| )
Full name: Stanford.NLP.FSharp.Parser.PennTreebankIIPatterns.( |Label|_| )
union case PennTreebankIITags.NN: PennTreebankIITags
union case PennTreebankIITags.NNS: PennTreebankIITags
union case PennTreebankIITags.NNP: PennTreebankIITags
union case PennTreebankIITags.NNPS: PennTreebankIITags
val isNPwithNNx : (Tree -> bool)
union case PennTreebankIITags.NP: PennTreebankIITags
val node : Tree
Tree.getChildrenAsList() : List
module Iterable
from java.util
from java.util
val castToSeq : iter:java.lang.Iterable -> seq<'T>
Full name: java.util.Iterable.castToSeq
Full name: java.util.Iterable.castToSeq
module Seq
from Microsoft.FSharp.Collections
from Microsoft.FSharp.Collections
val exists : predicate:('T -> bool) -> source:seq<'T> -> bool
Full name: Microsoft.FSharp.Collections.Seq.exists
Full name: Microsoft.FSharp.Collections.Seq.exists
val foldTree : (Tree list -> Tree -> Tree list)
val acc : Tree list
Tree.isLeaf() : bool
val fold : folder:('State -> 'T -> 'State) -> state:'State -> source:seq<'T> -> 'State
Full name: Microsoft.FSharp.Collections.Seq.fold
Full name: Microsoft.FSharp.Collections.Seq.fold
val state : Tree list
val x : Tree
val questions : string []
Full name: So_questions.questions
Full name: So_questions.questions
val iter : action:('T -> unit) -> source:seq<'T> -> unit
Full name: Microsoft.FSharp.Collections.Seq.iter
Full name: Microsoft.FSharp.Collections.Seq.iter
val printfn : format:Printf.TextWriterFormat<'T> -> 'T
Full name: Microsoft.FSharp.Core.ExtraTopLevelOperators.printfn
Full name: Microsoft.FSharp.Core.ExtraTopLevelOperators.printfn
Multiple items
type List =
member add : e:obj -> bool + 1 overload
member addAll : c:Collection -> bool + 1 overload
member clear : unit -> unit
member contains : o:obj -> bool
member containsAll : c:Collection -> bool
member equals : o:obj -> bool
member get : index:int -> obj
member hashCode : unit -> int
member indexOf : o:obj -> int
member isEmpty : unit -> bool
...
nested type __DefaultMethods
Full name: java.util.List
--------------------
type List<'T> =
| ( [] )
| ( :: ) of Head: 'T * Tail: 'T list
interface IEnumerable
interface IEnumerable<'T>
member Head : 'T
member IsEmpty : bool
member Item : index:int -> 'T with get
member Length : int
member Tail : 'T list
static member Cons : head:'T * tail:'T list -> 'T list
static member Empty : 'T list
Full name: Microsoft.FSharp.Collections.List<_>
type List =
member add : e:obj -> bool + 1 overload
member addAll : c:Collection -> bool + 1 overload
member clear : unit -> unit
member contains : o:obj -> bool
member containsAll : c:Collection -> bool
member equals : o:obj -> bool
member get : index:int -> obj
member hashCode : unit -> int
member indexOf : o:obj -> int
member isEmpty : unit -> bool
...
nested type __DefaultMethods
Full name: java.util.List
--------------------
type List<'T> =
| ( [] )
| ( :: ) of Head: 'T * Tail: 'T list
interface IEnumerable
interface IEnumerable<'T>
member Head : 'T
member IsEmpty : bool
member Item : index:int -> 'T with get
member Length : int
member Tail : 'T list
static member Cons : head:'T * tail:'T list -> 'T list
static member Empty : 'T list
Full name: Microsoft.FSharp.Collections.List<_>
val rev : list:'T list -> 'T list
Full name: Microsoft.FSharp.Collections.List.rev
Full name: Microsoft.FSharp.Collections.List.rev
val iter : action:('T -> unit) -> list:'T list -> unit
Full name: Microsoft.FSharp.Collections.List.iter
Full name: Microsoft.FSharp.Collections.List.iter
val p : Tree
Tree.getLeaves() : List
Tree.getLeaves(list: List) : List
Tree.getLeaves(list: List) : List
val castToArray : iter:java.lang.Iterable -> 'T []
Full name: java.util.Iterable.castToArray
Full name: java.util.Iterable.castToArray
type Array =
member Clone : unit -> obj
member CopyTo : array:Array * index:int -> unit + 1 overload
member GetEnumerator : unit -> IEnumerator
member GetLength : dimension:int -> int
member GetLongLength : dimension:int -> int64
member GetLowerBound : dimension:int -> int
member GetUpperBound : dimension:int -> int
member GetValue : params indices:int[] -> obj + 7 overloads
member Initialize : unit -> unit
member IsFixedSize : bool
...
Full name: System.Array
member Clone : unit -> obj
member CopyTo : array:Array * index:int -> unit + 1 overload
member GetEnumerator : unit -> IEnumerator
member GetLength : dimension:int -> int
member GetLongLength : dimension:int -> int64
member GetLowerBound : dimension:int -> int
member GetUpperBound : dimension:int -> int
member GetValue : params indices:int[] -> obj + 7 overloads
member Initialize : unit -> unit
member IsFixedSize : bool
...
Full name: System.Array
val map : mapping:('T -> 'U) -> array:'T [] -> 'U []
Full name: Microsoft.FSharp.Collections.Array.map
Full name: Microsoft.FSharp.Collections.Array.map
Tree.label() : edu.stanford.nlp.ling.Label