placeholderFactory = $placeholderFactory; }

	/**
	 * Tells whether the text contains an opening or closing translate tag
	 * outside of nowiki blocks.
	 *
	 * NOTE(review): the tag literal in the pattern was restored after markup
	 * stripping had reduced it to '~]~' -- confirm against the upstream file.
	 */
	public function containsMarkup( string $text ): bool {
		$nowiki = [];
		$text = $this->armourNowiki( $nowiki, $text );
		return preg_match( '~</?translate[ >]~', $text ) !== 0;
	}

	/**
	 * Remove all opening and closing translate tags following the same whitespace rules as the
	 * regular parsing. This doesn't try to parse the page, so it can handle unbalanced tags.
	 *
	 * Translation unit markers are removed as well, and the remaining text is passed through
	 * TranslationUnit::getTextForTrans() before the nowiki blocks are restored.
	 *
	 * NOTE(review): the tag and marker literals in the patterns below were restored after
	 * markup stripping had emptied them -- confirm against the upstream file.
	 */
	public function cleanupTags( string $text ): string {
		$nowiki = [];
		$text = $this->armourNowiki( $nowiki, $text );

		// Opening tags swallow one following newline, closing tags one preceding newline
		$text = preg_replace( '~<translate( nowrap)?>\n?~s', '', $text );
		$text = preg_replace( '~\n?</translate>~s', '', $text );

		// Markers: headers and the rest
		$ic = preg_quote( TranslationUnit::UNIT_MARKER_INVALID_CHARS, '~' );
		$text = preg_replace( "~(^=.*=) <!--T:[^$ic]+-->$~um", '\1', $text );
		$text = preg_replace( "~<!--T:[^$ic]+-->[\n ]?~um", '', $text );

		// Remove variables
		$unit = new TranslationUnit( $text );
		$text = $unit->getTextForTrans();

		return $this->unarmourNowiki( $nowiki, $text );
	}

	/**
	 * Replaces every translate tag pair in the text with a placeholder and parses the
	 * content of each pair into translation units.
	 *
	 * NOTE(review): the tag literals below were restored after markup stripping had emptied
	 * them (the stripped patterns matched the empty string, so every call failed with
	 * 'Nested tags') -- confirm against the upstream file.
	 *
	 * @throws ParsingFailure On nested tags, or an unmatched opening or closing tag
	 */
	public function parse( string $text ): ParserOutput {
		$nowiki = [];
		$text = $this->armourNowiki( $nowiki, $text );

		$sections = [];
		$tagPlaceHolders = [];

		while ( true ) {
			// Group 1 is the opening tag, group 2 the content between the tags
			$re = '~(<translate(?: nowrap)?>)(.*?)</translate>~s';
			$matches = [];
			$ok = preg_match( $re, $text, $matches, PREG_OFFSET_CAPTURE );

			if ( $ok === 0 || $ok === false ) {
				break; // No match or failure
			}

			$contentWithTags = $matches[0][0];
			$contentWithoutTags = $matches[2][0];
			// These are byte offsets of the whole match (tags included) in $text
			$offsetStart = $matches[0][1];
			$offsetEnd = $offsetStart + strlen( $contentWithTags );

			// Replace the whole match with a placeholder
			$ph = $this->placeholderFactory->make();
			$text = substr( $text, 0, $offsetStart ) . $ph . substr( $text, $offsetEnd );

			// The lazy match stops at the first closing tag, so an opening tag left inside
			// the content means the tags were nested
			if ( preg_match( '~<translate( nowrap)?>~', $contentWithoutTags ) !== 0 ) {
				throw new ParsingFailure(
					'Nested tags',
					[ 'pt-parse-nested', $contentWithoutTags ]
				);
			}

			$openTag = $matches[1][0];
			$canWrap = $openTag !== '<translate nowrap>';

			// Parse the content inside the tags
			$contentWithoutTags = $this->unarmourNowiki( $nowiki, $contentWithoutTags );
			$parse = $this->parseSection( $contentWithoutTags, $canWrap );

			// Update list of sections and the template with the results
			$sections += $parse['sections'];
			$tagPlaceHolders[$ph] = new Section( $openTag, $parse['template'], '</translate>' );
		}

		// Version of the template for error messages, with the placeholders made readable
		$prettyTemplate = $text;
		foreach ( array_keys( $tagPlaceHolders ) as $ph ) {
			$prettyTemplate = str_replace( $ph, '[...]', $prettyTemplate );
		}

		// All balanced pairs are placeholders by now; any tag still present is unmatched
		if ( preg_match( '~<translate( nowrap)?>~', $text ) !== 0 ) {
			throw new ParsingFailure(
				'Unmatched opening tag',
				[ 'pt-parse-open', $prettyTemplate ]
			);
		} elseif ( str_contains( $text, '</translate>' ) ) {
			throw new ParsingFailure(
				"Unmatched closing tag",
				[ 'pt-parse-close', $prettyTemplate ]
			);
		}

		$text = $this->unarmourNowiki( $nowiki, $text );

		return new ParserOutput( $text, $tagPlaceHolders, $sections );
	}

	/**
	 * Splits the content marked with \<translate> tags into translation units, which are
	 * separated with two or more newlines. Extra whitespace is captured in the template and
	 * is not included in the translation units.
	 *
	 * @internal
	 * @param string $text Content between one pair of tags
	 * @param bool $canWrap Value passed to setCanWrap() of every unit created
	 * @return array Key 'template' holds the text with each unit replaced by a placeholder,
	 *  key 'sections' holds the units indexed by their placeholder
	 * @throws ParsingFailure When a unit has a malformed marker (see parseUnit)
	 */
	public function parseSection( string $text, bool $canWrap ): array {
		$flags = PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE;
		$parts = preg_split( '~(^\s*|\s*\n\n\s*|\s*$)~', $text, -1, $flags );

		// Content without any newline is flagged as inline on every unit
		$inline = preg_match( '~\n~', $text ) === 0;

		$template = '';
		$sections = [];

		foreach ( $parts as $part ) {
			if ( trim( $part ) === '' ) {
				// Whitespace-only parts are the captured delimiters; they stay in the template
				$template .= $part;
			} else {
				$ph = $this->placeholderFactory->make();
				$tpSection = $this->parseUnit( $part );
				$tpSection->setIsInline( $inline );
				$tpSection->setCanWrap( $canWrap );
				$sections[$ph] = $tpSection;
				$template .= $ph;
			}
		}

		return [
			'template' => $template,
			'sections' => $sections,
		];
	}

	/**
	 * Checks if this unit already contains a section marker. If there
	 * is not, a new one will be created. Marker will have the value of
	 * -1, which will later be replaced with a real value.
	 *
	 * NOTE(review): the marker literals in the patterns below were restored after markup
	 * stripping had emptied them -- confirm against the upstream file.
	 *
	 * @internal
	 * @throws ParsingFailure When there are multiple markers, the marker is in an unsupported
	 *  position, or the unit has no content besides the marker
	 */
	public function parseUnit( string $content ): TranslationUnit {
		$re = '~<!--T:(.*?)-->~';
		$matches = [];
		$count = preg_match_all( $re, $content, $matches, PREG_SET_ORDER );

		if ( $count > 1 ) {
			throw new ParsingFailure(
				'Multiple translation unit markers',
				[ 'pt-shake-multiple', $content ]
			);
		}

		// If no id given in the source, default to a new section id
		$id = TranslationUnit::NEW_UNIT_ID;
		if ( $count === 1 ) {
			foreach ( $matches as $match ) {
				[ /*full*/, $id ] = $match;

				// Currently handle only these two standard places.
				// Is this too strict?
				$rer1 = '~^<!--T:(.*?)-->( |\n)~'; // Normal sections
				$rer2 = '~\s*<!--T:(.*?)-->$~m'; // Sections with title
				$content = preg_replace( $rer1, '', $content );
				$content = preg_replace( $rer2, '', $content );

				// A marker that survived both removals is somewhere else in the unit
				if ( preg_match( $re, $content ) === 1 ) {
					throw new ParsingFailure(
						'Translation unit marker is in unsupported position',
						[ 'pt-shake-position', $content ]
					);
				} elseif ( trim( $content ) === '' ) {
					throw new ParsingFailure(
						'Translation unit has no content besides marker',
						[ 'pt-shake-empty', $id ]
					);
				}
			}
		}

		return new TranslationUnit( $content, $id );
	}

	/**
	 * Replaces every nowiki block (tags included) with a placeholder so that later
	 * pattern matching does not see its content.
	 *
	 * NOTE(review): the tag literals in the pattern were restored after markup stripping had
	 * emptied them; the stripped pattern matched the empty string, which made the loop below
	 * never terminate -- confirm against the upstream file.
	 *
	 * @internal
	 * @param array &$holders Receives the replaced blocks indexed by their placeholder
	 * @param string $text
	 * @return string Text with the nowiki blocks replaced by placeholders
	 */
	public function armourNowiki( array &$holders, string $text ): string {
		$re = '~(<nowiki>)(.*?)(</nowiki>)~s';

		while ( preg_match( $re, $text, $matches ) ) {
			$ph = $this->placeholderFactory->make();
			// Identical blocks elsewhere in the text are replaced by the same placeholder
			$text = str_replace( $matches[0], $ph, $text );
			$holders[$ph] = $matches[0];
		}

		return $text;
	}

	/**
	 * Restores the nowiki blocks that armourNowiki() replaced with placeholders.
	 *
	 * @internal
	 * @param array $holders Replaced blocks indexed by their placeholder
	 * @param string $text
	 * @return string
	 */
	public function unarmourNowiki( array $holders, string $text ): string {
		return strtr( $text, $holders );
	}
}