#! /usr/bin/perl -w
# -*- mode:perl -*-
###########################################################
# penn2ref.pl 
# Author: Liu Lezhong, lliu2@ix.urz.uni-heidelberg.de
#
# Purpose: np-extraction from Penn Treebank as the input for 
#          the tool Referee
#                          
#
# Input:   raw Penn Treebank 
#
# Output:  filename.pure containing the pure text. 
#          filename.nummer containing the nummer of np positions.
#
# Known bugs: 
#          
#
# TODO:    
###########################################################
###########################################################
# Command line:
# perl penn2ref.pl file_prefix
# e.g.:perl ref2mate.pl cf02
###########################################################


$infile = $ARGV[0];

 #print "$infile\n";
###########################################################
# overall control structure 
##########################################################

###
# main program
###

#preprocessing
preprocessing($infile);

#create the pure text prefix
puretext($infile);
sproblem($infile);

#create the position nummers .prefix.info 
#create the attribut of NP-phrases .prefix.attr
terminalcount($infile);                 
phasenprint($infile);
goback($infile);
subknote($infile);
npphrase($infile);
npprad($infile);
npprad2($infile);
final($infile);
nummer($infile);
############################################################
# subroutines
############################################################

###
# purpose: removes not necessary whitespace and newline in every sentence and 
#          the explanations at the beginn of the file
# input:   raw Penn treebank file
# output:  prefix.new01 containing each sentence with just one newline at the
#          end 
###


sub preprocessing{
    $infile=$_[0];
    
    open(MRG,"$infile.mrg");
    open(NEW01,">$infile.new01");
    
    $/ ="(. .) ))\n"; #chunk begins with
    while (<MRG>){
	s/\*x.*\n//g; #get off the * at the begin of the data
	s/\n//g; #get off the first newline
	
	s/\n/ /g;  #get off the newline in chunk
	s/ \s+/ /g; # replace the more than one whitespace into one whitespace 
	print NEW01 "$_\n";
	
	
    }
    close (MRG);
    close (NEW01);
    
}
###
# purpose: to extract all the terminal(text) aut of the Treebank
# input:   prefix.new01 containing each sentence with just one newline at the
#          end 
# output:  The pure text but with 's 'm 'n't 'd 're problems
###

sub puretext{

    $infile=$_[0];
     open(NEW01,"$infile.new01");
     open(WITHS,">$infile.withs");
      $/ ="(. .) ))"; #chunk begins with
   $pos = 1; #the begin character of the text
    $line=1; #the bigin of line of text

    while (<NEW01>){

	(@terminals) = ( $_ =~ /\([^()]+\s([^()*]+)\)/g ); #save all the termal into an array

foreach $term (@terminals){
    $term1=$term;
  
        $oldPos = $pos; #save the position as begin of the terminal symbol.
        
        $pos += length( $term ) -1; #position plus the lengh of new term
                                    #and minus one is the position of the next new word beginning
          if ($term eq "'s"){  #because there will be a additional whitespace
                               # position so do the other case
       $oldPos--;
       $pos--;
       
   }
     if ($term eq "n't"){
       $oldPos--;
       $pos--;
       
   } 
    if ($term eq "'d"){
       $oldPos--;
       $pos--;
       
   }
 if ($term eq "'m"){
       $oldPos--;
       $pos--;
       
   }
          if ($term eq "'re"){
       $oldPos--;
       $pos--;
       
   }
         
	 if ($pos > 70){           #if the position over 70 than begin a newline
	$oldPos=1;      #the word newline begin with position 1
        $pos=$oldPos+length( $term )-1;  #and so on add the position of the other word
        $line++;
	print WITHS "\n";
}
  print WITHS "$term1 ";
  

        $pos=$pos+2; #so that we will not pass the whitespace position in text
     
}

}
close (NEW01);
close (WITHS);
}
###
# purpose: to clean those 's 'm 'n't 'd 're problems
# input:   prefix.withs 
# output:  prefix.pure without 's 'm 'n't 'd 're problems
###

sub sproblem{

    $infile=$_[0];
     open(WITHS,"$infile.withs");
     open(PURE,">$infile");
  $/ ="(. .) ))"; #chunk begins with
    while (<WITHS>){
   s/ 's/'s/g;         #otherwise the text will be "the teach 's"
   s/ n't/n't/g; #otherwise the text will be "you n't"
   s/ 'd/'d/g;    #and so on
   s/ 're/'re/g;
 s/ 'm/'m/g;
    print PURE $_;
}
    close (WITHS);
    close  (PURE);

}
###
# purpose: to clean those 's 'm 'n't 'd 're problems
# input:   prefix.withs 
# output:  prefix.pure without 's 'm 'n't 'd 're problems
###
 
sub terminalcount{

    $infile=$_[0];
     open(NEW01,"$infile.new01");
    open(TERMINAL,">$infile.terminal");
      $/ ="(. .) ))"; #chunk begins with
   $pos = 1;
    $line=1;

    while (<NEW01>){
       
   
(@terminals) = ( $_ =~ /\([^()]+\s([^()*]+\))/g ); #save all the termal with ) into an array,for example : she)


foreach $term (@terminals){
     
        $oldPos = $pos -1;       #it is all the same ,but due to the false of the referee tool ,we make every position's beginning minus one.
      
        $pos += length( $term ) -2; #position plus the lengh of new term
                                    #and minus two is the position of a new word beginning and the ")" has one position
          if ($term eq "'s)"){  #it is all most the same ,but we should write the ")" too.
       $oldPos--;
       $pos--;	}
           if ($term eq "n't)"){
       $oldPos--;
       $pos--;
       
   }     if ($term eq "'d)"){
       $oldPos--;
       $pos--;
       
   }
          if ($term eq "'re)"){
       $oldPos--;
       $pos--;
       
   }
 if ($term eq "'m)"){
       $oldPos--;
       $pos--;
       
   }
	 if ($pos > 70){
	$oldPos=0;  #then the new line will begin with 0
	$pos=$oldPos+length( $term )-1;
        $line++;
    }
   $_=~ s/\s[^()* ]+\)/&$term $line $oldPos $line $pos/;
 #we make the information of positions behind every terminal & is just replace the whitespace which later will be back to the whitespace again ,due to the 
 #function of  s/// .So that the terminal that had replaced ,will not be replace again.

     $pos=$pos+2;
     #so that we will not pass the whitespace position in text
}
s/&/ /g; #so here we make the whitespace back

print TERMINAL "$_\n\n"; #we print the sentence with the positions


}
close (NEW01);
close (TERMINAL);
}
 ####
 # purpose: print all the phrases with the postion informations 
 #   input: the prefix.new01       
 #  output: all the phrases with the postion informations 
 ###

sub phasenprint{
    $infile=$_[0];
    
    open(TERMINAL,"$infile.terminal");
    open(PHRASE,">$infile.phrase");
    open(NPDEPTH,">$infile.npdepth");
    $/ ="(. .) )\n"; #chunk begins with
    
    while (<TERMINAL>){ 
	chop($_);
	
	@array=split (//) ;  
        #turn every character of the chunk into an array. 
	$i=0;
	$k=-1; 
        #because later the $k will be used as index of the array,
        #and thearray begin with array[0]. 
	foreach $character (@array){  
	    $k++; 
            #$k with be the index of the position of every character.
	    if ("(" eq $character){
		$i++;   #only index the nummer of the phases.
		push @nummer ,$i; #we push it just becouse this information will be printed in another if-function
                push @phrasbegin ,$k; #push the begin position of the phrase
 if ("$array[$k+1]" eq "N" && "$array[$k+2]" eq "P"||"$array[$k+4]" eq "\$"){  
  #when after "(" there is a NP or PRP$ then we count the depth of the NP
		    $np=pop @npdepth;
                    $np++;
                    push @npdepth ,$np;
                    } #the idea is the same as to count the sentence depth
                $ni=pop @depth;     
		$ni++;              
		push @depth ,$ni;    
		#this three lines are for the sentence depth we can count it
                # with just one stake depth
	    } #if there is a ( ,then store the position of the ( in a stack,so that later we can pop it ,when the right ) appear.
	    
	    if (")" eq $character){
		
                $n=pop @nummer;     
                #$n is the index of all the phrases
                $h=pop @phrasbegin; 
                #$h is the bigin position of the phrases

		$in=pop @depth;   
                $kn=$in-1; 
                #these two are also the same for the sentence depth
                print PHRASE "$n";  
                #print the indes of the phrases
                print NPDEPTH "$n "; 
                #print the index into the file of NPDEPTH
		print PHRASE "_$kn"; 
                #print the sentence depth with _
		$in--;
                push @depth ,$in; 
                #this two lines is for the sentence depth too 

                for ($y=$h;$y<=$k;$y++){
		    
		    print PHRASE "$array[$y]";
		    
		} #print the begin till end of the phase
             if ($array[$h+1] eq N && $array[$h+2] eq P||"$array[$h+4]" eq "\$"){ #when the phrase begin with (NP or (PRP$ the print the npdepth with $nn
		 $nn=pop @npdepth;
		 print NPDEPTH "$nn\n";
		 $nn--;
		 push @npdepth ,$nn;
	     } 
	
		print PHRASE "\n";
		
}
	    
	    
	}
	
    }     
    
    
    close (TERMINAL);
    close (PHRASE);
    close (NPDEPTH);
}
 ###
 # purpose: removes all the teminal position informations so that later we
 #          can use the recursive program to extract the real entity
 #          removes the additional information in infile.NPDEPTH
 # input:   the prefix.phrase and prefix.npdepth
 # output:  the phrase without the teminal position informations
 #          the npdepth information with phrase indexes
 ###       
sub goback{
    $infile=$_[0];
    
    open(PHRASE,"$infile.phrase");
    open(BACK,">$infile.back");
    open(NPDEPTH,"$infile.npdepth");
    open(NPDEPTH2,">$infile.npdepth2");
    
     $/ ="\n"; #chunk begins with
   while (<NPDEPTH>){
       chop($_);
       $_=~/([0-9]+)\s([0-9]+)$/;
        #what we need is only the last two nummers in each line
        print NPDEPTH2 "$1 $2\n";
        #print it in NPDEPTH2
       }     
while (<PHRASE>){
	if (/[0-9]+\(NP\s\(NP\s/){
	  s/\s[0-9]+\s[0-9]+\s[0-9]+\s[0-9]+\s/ /g;
          #removes the information of the positions like 12 34 12 44 
		  print BACK "$_";  
  
	}
       
    } 
    close (NPDEPTH);
    close (NPDEPTH2);
    close (PHRASE);  
    close (BACK);
}

sub cfrule {
    ($head,$body) = @_; #get the value of $1 and $2 in subknote funtion

    return $head;  #return just NONterminal 
    
}
###
# purpose: to get the information of the first subknotes of a NP which we need 
# input:   $infile.back
# output:  creats a list of the index of the phrase which we need
###
sub subknote{
    $infile=$_[0];
    open(BACK,"$infile.back");
    open(LIST2,">$infile.list2");
    $/ ="\n";


    while (<BACK>) {

	$w = "[^ ^\(^\)]";   
        # words: everything without whitespace and "("or")"

	$k=$_;
	while ($_ =~ /\(($w+)\s+([^\(]+?)/){ 
        #$w match the NONterminal and $2 match the terminal          

	    $_ =~ s/\(($w+)\s+([^\(]+?)\)/cfrule($1,$2)/ge; 
            #in this recursive funtion the "e" make it posiable to 
            #set the cfrule funtion in one replace funtion
	  	         	
    } 

        
      if($body=~/NN/g){  #if there is a NN in the first depth of subknote of the NP phrase
    $k=~/([0-9]+)_[0-9]+\(NP\s\(NP\s/;
    #match the index of the phrase 
    $h=$1+1;
    #the first subknote is the index of the phrase +1
        (@nr2)=($h);
    print LIST2 "@nr2\n";

}
       if($body=~/CC/g){
         #if there is a CC in the first depth of subknote of the NP phrase
    $k=~/([0-9]+)_[0-9]+\(NP\s\(NP\s/; 
    $h=$1+1;  
     #the first subknote is the index of the phrase +1
             (@nr2)=($h);
    print LIST2 "@nr2\n";
        

	}
        if($body=~/, ,/g){
        #if there is a , , in the first depth of subknote of the NP phrase
    $k=~/([0-9]+)_[0-9]+\(NP\s\(NP\s/; 
    $h=$1+1;
 #the first subknote is the index of the phrase +1
    print LIST2 "$h\n";

     
}


    }
close(BACK);
    close(LIST2);
}


 ###
 # purpose: get the NP phrases out of all the phrases and extract the begin and
 #          end positions of each phrase and add it at the end of each phrase
 #          we need
 # input:   prefix.phrase
 # output:  NP phrases with the begin and end positions of each phrase and 
 #          add it at the end of each phrase
 ###


sub npphrase{
    $infile=$_[0];
    
    open(PHRASE,"$infile.phrase");
    open(PHRASENP,">$infile.phrasenp");

    $/ ="\n"; #chunk begins with
    while (<PHRASE>){
	chop($_);
	if (/[0-9]+\s[0-9]+\s[0-9]+\s[0-9]+/){
        #only when the position information exist
	if (/[0-9]+_[0-9]+\(NP/){
        #only when this phrase begin with NP             
	    print PHRASENP "$_";
            
              $i=1;
            #$i is the swich
            
	 if($_ =~ /.+?([0-9]+\s[0-9]+\s).+?\s([0-9]+\s[0-9]+)\s+\)+$/){
         #if the phrase longer than or the same long as two words
		print PHRASENP "$1$2\n";
		$i=0;
             #turn the swich off

              }
	    if ($i==1){
             #if swich still on
if($_ =~ /.+?([0-9]+\s[0-9]+\s).+?[0-9]+\s[0-9]+\s([0-9]+\s[0-9]+)\s+[^a-z]+\)+$/){  #if the phrase longer than or the same long as two words and has more than one ")" or other nonterminal at the end
        print PHRASENP "$1$2\n";
		$i=0;

            } 
          }
	    if ($i==1){
            #turn the swich off       
		if($_=~ /([0-9]+\s[0-9]+\s[0-9]+\s[0-9]+)\s/){
                #if the phrase has only one word
		    print PHRASENP "$1\n";
            }
	}
	    
	} 
    }
    }
	close (PHRASE);  
    close (PHRASENP);

    
}

 ###
 # purpose: get rid of the NP in NP-PRD that we don't need produce the list1
 #          in which the index of phrases that we don't need
 # input:   infile.phrasenp
 # output:  the npphrases without NP-PRD List1 is the index of phrases we don't
 #          need,the list3 muss be worded later
 ###
sub npprad{
      
  $infile=$_[0];
    
   
    open(PHRASENP,"$infile.phrasenp");
    open(LIST2,"$infile.list2");
  open(LIST1,">$infile.list1");
 open(PHRASENPNP,">$infile.phrasenpnp");
  open(LIST3,">$infile.list3");
    $/ ="\n"; #chunk begins with
  $k=0;
  while (<LIST2>){
   #save the list in @nr2
   #because list2 are the phrase we need, so we should turn it into the list
   #that we don't need
      chop($_);
      
   $nr2[$k]=$_;
      $k++;
      
  }
 
 while (<PHRASENP>){
      $i=1;
    #$i is a swich
      if (/([0-9]+)_[0-9]+\(NP-PRD/){
      #if the phrase begin with NP-PRD
	  $PRD=$1+1;
      #$PRD is the index of the phrase we don't need
	  print LIST3 "$PRD\n" ;
          $i=0;
       #turn the swich off
        if(/, ,/){
        #if there are , , in a PRD phrase then we make an "u"before it
           print LIST3 "u$PRD\n" ; 
        }  
	 
      }
if (/([0-9]+)_[0-9]+\(NP \(NP /){
     #if the phrase begin with (NP (NP
    $h=$1+1;
     #we get the nummer of the first subknot of this phrase
       $j=1;
     #$j is a swich
    foreach $nummer (@nr2){
      
       if($nummer == $h){
        #if the nummer is the same then we need this phrase
       $j=0 ;  }
   
    }
    if($j==1){
       #if the swich is on ,so the nummer is the nummer we don't need
	print LIST1 "$h\n";

    }
}

        if($i==1){       
     #the swich $i is on ,so it is not a NP-PRD,then it is the phrase we need

	 print PHRASENPNP "$_";
     }
}

  
  close (PHRASENP);
  close (LIST2);
  close (PHRASENPNP);
  close (LIST1);
  close (LIST3);
}
 ###
 # purpose: for the NP-PRD we should get rid of all the sonknote of the first 
 #          subknote and also if there is a , , in NP-PRD the first knote after
 #          it muss also be deleted
 # input:   list3 and all the NP phrase
 # output:  List4 contain the index of the phrase that we don't need
 ###
sub npprad2{
      
  $infile=$_[0];
  open(LIST3,"$infile.list3");
  open(PHRASENP,"$infile.phrasenp");
  open(LIST4,">$infile.list4");

  $/ ="\n"; #chunk begins with
  $k=0;
  while (<LIST3>){
      chop($_);
 #save the list3 in @nr3     
   $nr3[$k]=$_;
      $k++;
      
  }
while (<PHRASENP>){

  if (/([0-9]+)_[0-9]+\(NP/){
    #match each index of the NP phrases
         $numm=$1;   
    foreach $nummer (@nr3){
           
       if($nummer =~/u([0-9]+)/){
        #if there is a "u" before the nummer ,that means this PRD phrase has
        # a , , then we should get the index of the first knote after , ,  
	   if ($numm==$1){
          #to find the PRD phrase
           $PRD=$numm-1;
       #we reduce it at first ,then later with first "(" it will be add again
         @array=split (//) ;
         #save the phrase in characters 
	 foreach $character (@array){  
	 	    
	  if ("(" eq $character){
	 #and then count the nummer of "(" in the first subknote of this NP-PRD
          $PRD= $PRD+1;
          
	 }
             }
               
            $PRD=$PRD +2;
            #finaly we add 2 to this nummer so we get the index of the first 
            #knote after , , 
	   print LIST4 "$PRD\n";
            #then add it to list4 
}
       }
   
         
       if($nummer eq $numm){
        #if the index in list3 do not contain and "u"
        #we muss get rid of all the nummer in the first subknote
           $PD=$numm-1;
          
         @array1=split (//) ; 
	 foreach $cha (@array1){  
	 	    
	  if ("(" eq $cha){
	 #we count it and add every time the nummer in it
	      $PD= $PD+1;
	      print LIST4 "$PD\n";
	 }
      }
	  
       }
   }
   }   
}


close (LIST3);  
close (PHRASENP); 
close (LIST4);

}
 ###
 # purpose: get rid of all the phrases we don't need
 # input:   list1 list4 and the np phrases
 # output:  the realnp we need
 #          
 ###
sub final{    


 $infile=$_[0];
 open(PHRASENPNP,"$infile.phrasenpnp");
 open(LIST1,"$infile.list1");
 open(LIST4,"$infile.list4");
 open(REALNP,">$infile.realnp");
  $/ ="\n"; #chunk begins with
 $k=0;
 while (<LIST1>){
      chop($_);
      #save the list1 in @nr1
   $nr1[$k]=$_;
      $k++;
   
  }
 $z=0;
 while (<LIST4>){
      chop($_);
       #save the list4 in @nr4
   $nr4[$z]=$_;
      $z++;
   
  }

 while(<PHRASENPNP>){


         $i=1;
         #the swich
        $_=~/^([0-9]+)_[0-9]+.+$/;
 
     foreach $nummer (@nr1){
	
	
	 if ($nummer == $1){
	#if the nummer is the same as index then swich off
	     $i=0;  
        }
  }
 foreach $num (@nr4){
	
	
	 if ($num == $1){
	#if the nummer is the same as index then swich off
	     $i=0;  
        }
  }

     if($i==1){
         #if the swich on then print the realnp
	 print REALNP $_;
	 if( $_=~/([0-9]+)_([0-9]+)([^0-9]+)(\(PRP\$\s[A-Za-z]*\))\s(([0-9]+)\s([0-9]+)\s([0-9]+)\s([0-9]+))/){
   #matchs the phrase containing PRP$
   #$1 is the index of phrase ,$2 is the sentence depth,
   #$3 is the the nontermimal before PRP$
	 $_=$3;
   #this is just for the $tr/// comment
	 $tr=tr/(//;
    #count how many "(" are there in $3
		 $p=$1+$tr;
     #$p is then the index of phrase (PRP$
		 $q=$2+$tr;
     #$q is then the sentence depth of phrase (PRP$
		 $w=1;
               #the swich
		 if ($p==$h){
               #if the phrase had allready been worked then then the swich off
               #so that the phrase will not appear in two times
		     $w=0;
		 }
		 if($w==1){
        
   		 print REALNP "$p\_$q$4$5\n";
               #so that the phrase PRP$ look like the other realnp we need
	     }
	      $h =$p;
         #save the index in order to compare with the phrase that comes later
	     }
} 


}
 close (REALNP);
 close (LIST1);
 close (PHRASENPNP);
}

###
# purpose:to give  
# input:   
# output:  
#          
###
sub nummer{

 open(REALNP,"$infile.realnp");
 open(INFO,">.$infile.info");
 open(NPDEPTH2,"$infile.npdepth2");
 open(ATTR,">.$infile.attr");
  $/ ="\n"; #chunk begins with
 
 while(<REALNP>){
     chop($_);  
     $_=~/^([0-9]+)(_[0-9]+.+\))([0-9]+\s[0-9]+\s[0-9]+\s[0-9]+)$/;
  
     $hash{$1}="$3";
     $hash2{$1}="$2";
 }
 while(<NPDEPTH2>){
     chop($_);
     $_=~/([0-9]+) ([0-9]+)/;
      $hash3{$1}="$2";
 }
      

 $i=1; 
 $j=1;
sub bynummer{
    $a<=>$b;
}
   @array=sort bynummer keys %hash;

foreach $num (@array){
   
 print INFO "$i $hash{$num} 0 0\n";
        $i++;  
}
  @array2=sort bynummer keys %hash2;

foreach $num2 (@array2){
              print ATTR "($j)";
               if ($hash2{$num2}=~/^_([0-9]+)/){
		   print ATTR "SDEPTH($1)";
	       }
	      $G=1;
              $S=1;
	      if ($hash2{$num2}=~/^_([0-9]+)\(NP-([A-Z]+)/){
		  
		  if ($2 eq SBJ){
		      print ATTR "(Grammatical Role)(SBJ)" ;
		      $G=0;
		  }
	       if ($2 eq NOM){
		      print ATTR "(Grammatical Role)(NOM)" ;
		      $G=0;
		  }
	       if ($2 eq CLF){
		      print ATTR "(Grammatical Role)(CLF)" ;
		      $G=0;
		  }
                 if ($2 eq ADV){
		      print ATTR "(Grammatical Role)(ADV)" ;
		      $G=0;
		  }
                   if ($2 eq LGS){
		      print ATTR "(Grammatical Role)(LGS)" ;
		      $G=0;
		  }
		   if ($2 eq PRD){
		      print ATTR "(Grammatical Role)(PRD)" ;
		      $G=0;
		  }
		   if ($2 eq TPC){
		      print ATTR "(Grammatical Role)(TPC)" ;
		      $G=0;
		  }
		   if ($2 eq CLR){
		      print ATTR "(Grammatical Role)(CLR)" ;
		      $G=0;
		  }
		  if ($2 eq TMP){
		      print ATTR "(Semantic Role)(TMP)" ;
		      $S=0;
		  }
	       if ($2 eq VOC){
		      print ATTR "(Semantic Role)(VOC)" ;
		      $S=0;
		  }
		  if ($2 eq DIR){
		      print ATTR "(Semantic Role)(DIR)" ;
		      $S=0;
		  }
		   if ($2 eq LOC){
		      print ATTR "(Semantic Role)(LOC)" ;
		      $S=0;
		  }
		   if ($2 eq PRP){
		      print ATTR "(Semantic Role)(PRP)" ;
		      $S=0;
		  }
		   if ($2 eq MNR){
		      print ATTR "(Semantic Role)(MNR)" ;
		      $S=0;
		  }
	      }
		  if ($G==1){
		      print ATTR "(Grammatical Role)(none)";
		  }
                 if ($S==1){
		      print ATTR "(Semantic Role)(none)";
		  }
             print ATTR "NPDEPTH($hash3{$num2})";
	     	       $NP=1;
               if ($hash2{$num2}=~/^_[0-9]+\(PRP\$\s[A-Za-z]*\)$/){
		   print ATTR "(NPFORM)(PRP\$)\n";
		   $NP=0;
	       } 
   if($hash2{$num2}=~/^_[0-9]+\([A-Z-]*\s\(PRP\s[A-Za-z]*\)\s[0-9]+\s[0-9]+\s[0-9]+\s[0-9]+\s\)$/){
                       print ATTR "(NPFORM)(PRP)\n";
				   $NP=0;
		   }
             
        if($hash2{$num2}=~/^_[0-9]+\(NP\s\(DT\s[A-Za-z]*\)[^A-Z^a-z]*$/){
                       print ATTR "(NPFORM)(DTpro)\n";
				   $NP=0;
		   }
            if($hash2{$num2}=~/^_[0-9]+\(NP[A-Z\=\-0-9]*\s\(NNP\s[A-Za-z]*\).+?[NNP]*\)$/){
                       print ATTR "(NPFORM)(NNP)\n";
				   $NP=0;
		   }
           if ($NP==1){         
  if($hash2{$num2}=~/^_[0-9]+\(NP\s\(DT a\)/||$hash2{$num2}=~/^_[0-9]+\(NP\s\([^DT]+/){
      print ATTR "(NPFORM)(IndefNP)\n";
  }
  else{
   print ATTR "(NPFORM)(defNP)\n";
}

}

 $j++;
      
}


close(ATTR);
close (INFO);
 close (REALNP);
}